1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to reuse a base from a nearby instruction so
24 // that the resulting 13-bit constant offset can then be promoted to the
25 // immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together to fit once rebased, we can
56 //   add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82 };
83 
84 struct AddressRegs {
85   unsigned char NumVAddrs = 0;
86   bool SBase = false;
87   bool SRsrc = false;
88   bool SOffset = false;
89   bool VAddr = false;
90   bool Addr = false;
91   bool SSamp = false;
92 };
93 
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96 
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98   struct CombineInfo {
99     MachineBasicBlock::iterator I;
100     unsigned EltSize;
101     unsigned Offset;
102     unsigned Width;
103     unsigned Format;
104     unsigned BaseOff;
105     unsigned DMask;
106     InstClassEnum InstClass;
107     unsigned CPol = 0;
108     bool IsAGPR;
109     bool UseST64;
110     int AddrIdx[MaxAddressRegs];
111     const MachineOperand *AddrReg[MaxAddressRegs];
112     unsigned NumAddresses;
113     unsigned Order;
114 
115     bool hasSameBaseAddress(const MachineInstr &MI) {
116       for (unsigned i = 0; i < NumAddresses; i++) {
117         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
118 
119         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
120           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
121               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
122             return false;
123           }
124           continue;
125         }
126 
127         // Check same base pointer. Be careful of subregisters, which can occur
128         // with vectors of pointers.
129         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
130             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
131           return false;
132         }
133       }
134       return true;
135     }
136 
137     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
138       for (unsigned i = 0; i < NumAddresses; ++i) {
139         const MachineOperand *AddrOp = AddrReg[i];
140         // Immediates are always OK.
141         if (AddrOp->isImm())
142           continue;
143 
144         // Don't try to merge addresses that are neither immediates nor
145         // registers. TODO: Should be possible to merge FrameIndexes and
146         // maybe some other non-register operands.
147         if (!AddrOp->isReg())
148           return false;
149 
150         // TODO: We should be able to merge physical reg addresses.
151         if (AddrOp->getReg().isPhysical())
152           return false;
153 
154         // If an address has only one use then there will be no other
155         // instructions with the same address, so we can't merge this one.
156         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
157           return false;
158       }
159       return true;
160     }
161 
162     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
163   };
164 
165   struct BaseRegisters {
166     Register LoReg;
167     Register HiReg;
168 
169     unsigned LoSubReg = 0;
170     unsigned HiSubReg = 0;
171   };
172 
173   struct MemAddress {
174     BaseRegisters Base;
175     int64_t Offset = 0;
176   };
177 
178   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179 
180 private:
181   const GCNSubtarget *STM = nullptr;
182   const SIInstrInfo *TII = nullptr;
183   const SIRegisterInfo *TRI = nullptr;
184   MachineRegisterInfo *MRI = nullptr;
185   AliasAnalysis *AA = nullptr;
186   bool OptimizeAgain;
187 
188   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
189                            const DenseSet<Register> &ARegUses,
190                            const MachineInstr &A, const MachineInstr &B) const;
191   static bool dmasksCanBeCombined(const CombineInfo &CI,
192                                   const SIInstrInfo &TII,
193                                   const CombineInfo &Paired);
194   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
195                                    CombineInfo &Paired, bool Modify = false);
196   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
197                         const CombineInfo &Paired);
198   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
199   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
200                                                      const CombineInfo &Paired);
201   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
202                                                     const CombineInfo &Paired);
203   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
204 
205   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
206 
207   unsigned read2Opcode(unsigned EltSize) const;
208   unsigned read2ST64Opcode(unsigned EltSize) const;
209   MachineBasicBlock::iterator
210   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
211                  MachineBasicBlock::iterator InsertBefore);
212 
213   unsigned write2Opcode(unsigned EltSize) const;
214   unsigned write2ST64Opcode(unsigned EltSize) const;
215   MachineBasicBlock::iterator
216   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
217                   MachineBasicBlock::iterator InsertBefore);
218   MachineBasicBlock::iterator
219   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
220                  MachineBasicBlock::iterator InsertBefore);
221   MachineBasicBlock::iterator
222   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
223                           MachineBasicBlock::iterator InsertBefore);
224   MachineBasicBlock::iterator
225   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
226                       MachineBasicBlock::iterator InsertBefore);
227   MachineBasicBlock::iterator
228   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
229                        MachineBasicBlock::iterator InsertBefore);
230   MachineBasicBlock::iterator
231   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
232                        MachineBasicBlock::iterator InsertBefore);
233   MachineBasicBlock::iterator
234   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
235                         MachineBasicBlock::iterator InsertBefore);
236 
237   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
238                            int32_t NewOffset) const;
239   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
240   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
241   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
242   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
243   /// Promotes a constant offset to the immediate by adjusting the base. It
244   /// tries to use a base from the nearby instructions that allows it to have
245   /// a 13-bit constant offset which gets promoted to the immediate.
246   bool promoteConstantOffsetToImm(MachineInstr &CI,
247                                   MemInfoMap &Visited,
248                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
249   void addInstToMergeableList(const CombineInfo &CI,
250                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
251 
252   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
253       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
254       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
255       std::list<std::list<CombineInfo>> &MergeableInsts) const;
256 
257 public:
258   static char ID;
259 
260   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
261     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
262   }
263 
264   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
265                                      bool &OptimizeListAgain);
266   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
267 
268   bool runOnMachineFunction(MachineFunction &MF) override;
269 
270   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
271 
272   void getAnalysisUsage(AnalysisUsage &AU) const override {
273     AU.setPreservesCFG();
274     AU.addRequired<AAResultsWrapperPass>();
275 
276     MachineFunctionPass::getAnalysisUsage(AU);
277   }
278 
279   MachineFunctionProperties getRequiredProperties() const override {
280     return MachineFunctionProperties()
281       .set(MachineFunctionProperties::Property::IsSSA);
282   }
283 };
284 
285 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
286   const unsigned Opc = MI.getOpcode();
287 
288   if (TII.isMUBUF(Opc)) {
289     // FIXME: Handle d16 correctly
290     return AMDGPU::getMUBUFElements(Opc);
291   }
292   if (TII.isMIMG(MI)) {
293     uint64_t DMaskImm =
294         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
295     return countPopulation(DMaskImm);
296   }
297   if (TII.isMTBUF(Opc)) {
298     return AMDGPU::getMTBUFElements(Opc);
299   }
300 
301   switch (Opc) {
302   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
303     return 1;
304   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
305     return 2;
306   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
307     return 4;
308   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
309     return 8;
310   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
311   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
312   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
313   case AMDGPU::DS_WRITE_B32_gfx9:
314     return 1;
315   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
316   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
317   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
318   case AMDGPU::DS_WRITE_B64_gfx9:
319     return 2;
320   default:
321     return 0;
322   }
323 }
324 
325 /// Maps instruction opcode to enum InstClassEnum.
326 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
327   switch (Opc) {
328   default:
329     if (TII.isMUBUF(Opc)) {
330       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
331       default:
332         return UNKNOWN;
333       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
334       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
335       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
336       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
337         return BUFFER_LOAD;
338       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
339       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
340       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
341       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
342         return BUFFER_STORE;
343       }
344     }
345     if (TII.isMIMG(Opc)) {
346       // Ignore instructions encoded without vaddr.
347       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
348           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
349         return UNKNOWN;
350       // Ignore BVH instructions
351       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
352         return UNKNOWN;
353       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
354       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
355           TII.isGather4(Opc))
356         return UNKNOWN;
357       return MIMG;
358     }
359     if (TII.isMTBUF(Opc)) {
360       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
361       default:
362         return UNKNOWN;
363       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
364       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
365       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
366       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
367         return TBUFFER_LOAD;
368       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
369       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
370       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
371       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
372         return TBUFFER_STORE;
373       }
374     }
375     return UNKNOWN;
376   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
377   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
378   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
379   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
380     return S_BUFFER_LOAD_IMM;
381   case AMDGPU::DS_READ_B32:
382   case AMDGPU::DS_READ_B32_gfx9:
383   case AMDGPU::DS_READ_B64:
384   case AMDGPU::DS_READ_B64_gfx9:
385     return DS_READ;
386   case AMDGPU::DS_WRITE_B32:
387   case AMDGPU::DS_WRITE_B32_gfx9:
388   case AMDGPU::DS_WRITE_B64:
389   case AMDGPU::DS_WRITE_B64_gfx9:
390     return DS_WRITE;
391   }
392 }
393 
394 /// Determines instruction subclass from opcode. Only instructions
395 /// of the same subclass can be merged together. The merged instruction may have
396 /// a different subclass but must have the same class.
397 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
398   switch (Opc) {
399   default:
400     if (TII.isMUBUF(Opc))
401       return AMDGPU::getMUBUFBaseOpcode(Opc);
402     if (TII.isMIMG(Opc)) {
403       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
404       assert(Info);
405       return Info->BaseOpcode;
406     }
407     if (TII.isMTBUF(Opc))
408       return AMDGPU::getMTBUFBaseOpcode(Opc);
409     return -1;
410   case AMDGPU::DS_READ_B32:
411   case AMDGPU::DS_READ_B32_gfx9:
412   case AMDGPU::DS_READ_B64:
413   case AMDGPU::DS_READ_B64_gfx9:
414   case AMDGPU::DS_WRITE_B32:
415   case AMDGPU::DS_WRITE_B32_gfx9:
416   case AMDGPU::DS_WRITE_B64:
417   case AMDGPU::DS_WRITE_B64_gfx9:
418     return Opc;
419   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
420   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
421   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
422   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
423     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
424   }
425 }
426 
427 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
428   AddressRegs Result;
429 
430   if (TII.isMUBUF(Opc)) {
431     if (AMDGPU::getMUBUFHasVAddr(Opc))
432       Result.VAddr = true;
433     if (AMDGPU::getMUBUFHasSrsrc(Opc))
434       Result.SRsrc = true;
435     if (AMDGPU::getMUBUFHasSoffset(Opc))
436       Result.SOffset = true;
437 
438     return Result;
439   }
440 
441   if (TII.isMIMG(Opc)) {
442     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
443     if (VAddr0Idx >= 0) {
444       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
445       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
446     } else {
447       Result.VAddr = true;
448     }
449     Result.SRsrc = true;
450     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
451     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
452       Result.SSamp = true;
453 
454     return Result;
455   }
456   if (TII.isMTBUF(Opc)) {
457     if (AMDGPU::getMTBUFHasVAddr(Opc))
458       Result.VAddr = true;
459     if (AMDGPU::getMTBUFHasSrsrc(Opc))
460       Result.SRsrc = true;
461     if (AMDGPU::getMTBUFHasSoffset(Opc))
462       Result.SOffset = true;
463 
464     return Result;
465   }
466 
467   switch (Opc) {
468   default:
469     return Result;
470   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
471   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
472   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
474     Result.SBase = true;
475     return Result;
476   case AMDGPU::DS_READ_B32:
477   case AMDGPU::DS_READ_B64:
478   case AMDGPU::DS_READ_B32_gfx9:
479   case AMDGPU::DS_READ_B64_gfx9:
480   case AMDGPU::DS_WRITE_B32:
481   case AMDGPU::DS_WRITE_B64:
482   case AMDGPU::DS_WRITE_B32_gfx9:
483   case AMDGPU::DS_WRITE_B64_gfx9:
484     Result.Addr = true;
485     return Result;
486   }
487 }
488 
489 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
490                                               const SILoadStoreOptimizer &LSO) {
491   I = MI;
492   unsigned Opc = MI->getOpcode();
493   InstClass = getInstClass(Opc, *LSO.TII);
494 
495   if (InstClass == UNKNOWN)
496     return;
497 
498   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
499 
500   switch (InstClass) {
501   case DS_READ:
502    EltSize =
503           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
504                                                                           : 4;
505    break;
506   case DS_WRITE:
507     EltSize =
508           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
509                                                                             : 4;
510     break;
511   case S_BUFFER_LOAD_IMM:
512     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
513     break;
514   default:
515     EltSize = 4;
516     break;
517   }
518 
519   if (InstClass == MIMG) {
520     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
521     // Offset is not considered for MIMG instructions.
522     Offset = 0;
523   } else {
524     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
525     Offset = I->getOperand(OffsetIdx).getImm();
526   }
527 
528   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
529     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
530 
531   Width = getOpcodeWidth(*I, *LSO.TII);
532 
533   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
534     Offset &= 0xffff;
535   } else if (InstClass != MIMG) {
536     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
537   }
538 
539   AddressRegs Regs = getRegs(Opc, *LSO.TII);
540 
541   NumAddresses = 0;
542   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
543     AddrIdx[NumAddresses++] =
544         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
545   if (Regs.Addr)
546     AddrIdx[NumAddresses++] =
547         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
548   if (Regs.SBase)
549     AddrIdx[NumAddresses++] =
550         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
551   if (Regs.SRsrc)
552     AddrIdx[NumAddresses++] =
553         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
554   if (Regs.SOffset)
555     AddrIdx[NumAddresses++] =
556         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
557   if (Regs.VAddr)
558     AddrIdx[NumAddresses++] =
559         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
560   if (Regs.SSamp)
561     AddrIdx[NumAddresses++] =
562         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
563   assert(NumAddresses <= MaxAddressRegs);
564 
565   for (unsigned J = 0; J < NumAddresses; J++)
566     AddrReg[J] = &I->getOperand(AddrIdx[J]);
567 }
568 
569 } // end anonymous namespace.
570 
571 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
572                       "SI Load Store Optimizer", false, false)
573 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
574 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
575                     false, false)
576 
577 char SILoadStoreOptimizer::ID = 0;
578 
579 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
580 
581 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
582   return new SILoadStoreOptimizer();
583 }
584 
585 static void addDefsUsesToList(const MachineInstr &MI,
586                               DenseSet<Register> &RegDefs,
587                               DenseSet<Register> &RegUses) {
588   for (const auto &Op : MI.operands()) {
589     if (!Op.isReg())
590       continue;
591     if (Op.isDef())
592       RegDefs.insert(Op.getReg());
593     if (Op.readsReg())
594       RegUses.insert(Op.getReg());
595   }
596 }
597 
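// Returns true if instruction \p B can safely be swapped with instruction
// \p A: the two may not alias when either one writes memory, \p B may not
// read or write any register that \p A defines, and \p B may not define any
// register that \p A reads. \p ARegDefs and \p ARegUses are the registers
// defined and read by \p A, precomputed by the caller via addDefsUsesToList.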
598 bool SILoadStoreOptimizer::canSwapInstructions(
599     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
600     const MachineInstr &A, const MachineInstr &B) const {
601   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
602       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
603     return false;
604   for (const auto &BOp : B.operands()) {
605     if (!BOp.isReg())
606       continue;
607     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
608       return false;
609     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
610       return false;
611   }
612   return true;
613 }
614 
615 // This function assumes that \p A and \p B are identical except for
616 // size and offset, and they reference adjacent memory.
617 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
618                                                    const MachineMemOperand *A,
619                                                    const MachineMemOperand *B) {
620   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
621   unsigned Size = A->getSize() + B->getSize();
622   // This function adds the offset parameter to the existing offset for A,
623   // so we pass 0 here as the offset and then manually set it to the correct
624   // value after the call.
625   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
626   MMO->setOffset(MinOffset);
627   return MMO;
628 }
629 
630 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
631                                                const SIInstrInfo &TII,
632                                                const CombineInfo &Paired) {
633   assert(CI.InstClass == MIMG);
634 
635   // Ignore instructions with tfe/lwe set.
636   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
637   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
638 
639   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
640     return false;
641 
642   // Check other optional immediate operands for equality.
643   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
644                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
645                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
646 
647   for (auto op : OperandsToMatch) {
648     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
649     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
650       return false;
651     if (Idx != -1 &&
652         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
653       return false;
654   }
655 
656   // Check DMask for overlaps.
657   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
658   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
659 
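  // The dmasks can only be combined when they do not interleave: every bit
  // of the smaller mask must lie below the lowest set bit of the larger mask.
  // E.g. 0b0011 and 0b1100 can be merged, but 0b0011 and 0b0110 cannot.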
660   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
661   if ((1u << AllowedBitsForMin) <= MinMask)
662     return false;
663 
664   return true;
665 }
666 
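// Returns the buffer format that keeps OldFormat's bits per component and
// numeric format but has ComponentCount components, or 0 if no such format
// exists on this subtarget.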
667 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
668                                        unsigned ComponentCount,
669                                        const GCNSubtarget &STI) {
670   if (ComponentCount > 4)
671     return 0;
672 
673   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
674       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
675   if (!OldFormatInfo)
676     return 0;
677 
678   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
679       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
680                                            ComponentCount,
681                                            OldFormatInfo->NumFormat, STI);
682 
683   if (!NewFormatInfo)
684     return 0;
685 
686   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
687          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
688 
689   return NewFormatInfo->Format;
690 }
691 
692 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
693 // highest power of two. Note that the result is well defined for all inputs
694 // including corner cases like:
695 // - if Lo == Hi, return that value
696 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
697 // - if Lo > Hi, return 0 (as if the range wrapped around)
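//
// For example, mostAlignedValueInRange(5, 20) returns 16: clearing every bit
// of Hi below the highest bit in which (Lo - 1) and Hi differ yields a value
// that is still >= Lo, and no value in [Lo, Hi] can be aligned to a higher
// power of two.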
698 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
699   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
700 }
701 
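// Returns true if the offsets of CI and Paired can be represented by a single
// merged access: adjacent dword ranges for buffer/SMEM classes, or offsets
// encodable in the two 8-bit fields of a DS read2/write2 (possibly using the
// ST64 forms or a shifted base). If Modify is true, CI and Paired are updated
// in place with the offsets (and BaseOff/UseST64) for the merged instruction.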
702 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
703                                                 const GCNSubtarget &STI,
704                                                 CombineInfo &Paired,
705                                                 bool Modify) {
706   assert(CI.InstClass != MIMG);
707 
708   // XXX - Would the same offset be OK? Is there any reason this would happen or
709   // be useful?
710   if (CI.Offset == Paired.Offset)
711     return false;
712 
713   // This won't be valid if the offset isn't aligned.
714   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
715     return false;
716 
717   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
718 
719     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
720         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
721     if (!Info0)
722       return false;
723     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
724         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
725     if (!Info1)
726       return false;
727 
728     if (Info0->BitsPerComp != Info1->BitsPerComp ||
729         Info0->NumFormat != Info1->NumFormat)
730       return false;
731 
732     // TODO: Should be possible to support more formats, but if format loads
733     // are not dword-aligned, the merged load might not be valid.
734     if (Info0->BitsPerComp != 32)
735       return false;
736 
737     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
738       return false;
739   }
740 
741   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
742   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
743   CI.UseST64 = false;
744   CI.BaseOff = 0;
745 
746   // Handle all non-DS instructions.
747   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
748     return (EltOffset0 + CI.Width == EltOffset1 ||
749             EltOffset1 + Paired.Width == EltOffset0) &&
750            CI.CPol == Paired.CPol;
751   }
752 
753   // If the offset in elements doesn't fit in 8 bits, we might be able to use
754   // the stride 64 versions.
755   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
756       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
757     if (Modify) {
758       CI.Offset = EltOffset0 / 64;
759       Paired.Offset = EltOffset1 / 64;
760       CI.UseST64 = true;
761     }
762     return true;
763   }
764 
765   // Check if the new offsets fit in the reduced 8-bit range.
766   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
767     if (Modify) {
768       CI.Offset = EltOffset0;
769       Paired.Offset = EltOffset1;
770     }
771     return true;
772   }
773 
774   // Try to shift base address to decrease offsets.
775   uint32_t Min = std::min(EltOffset0, EltOffset1);
776   uint32_t Max = std::max(EltOffset0, EltOffset1);
777 
778   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
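  // This is the base-shifted ST64 form: it applies when the two element
  // offsets differ by a multiple of 64 that is at most 0xff * 64, so that
  // subtracting a suitable BaseOff turns both offsets into multiples of 64
  // whose quotients fit in the 8-bit offset fields.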
779   if (((Max - Min) & ~Mask) == 0) {
780     if (Modify) {
781       // From the range of values we could use for BaseOff, choose the one that
782       // is aligned to the highest power of two, to maximise the chance that
783       // the same offset can be reused for other load/store pairs.
784       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
785       // Copy the low bits of the offsets, so that when we adjust them by
786       // subtracting BaseOff they will be multiples of 64.
787       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
788       CI.BaseOff = BaseOff * CI.EltSize;
789       CI.Offset = (EltOffset0 - BaseOff) / 64;
790       Paired.Offset = (EltOffset1 - BaseOff) / 64;
791       CI.UseST64 = true;
792     }
793     return true;
794   }
795 
796   if (isUInt<8>(Max - Min)) {
797     if (Modify) {
798       // From the range of values we could use for BaseOff, choose the one that
799       // is aligned to the highest power of two, to maximise the chance that
800       // the same offset can be reused for other load/store pairs.
801       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
802       CI.BaseOff = BaseOff * CI.EltSize;
803       CI.Offset = EltOffset0 - BaseOff;
804       Paired.Offset = EltOffset1 - BaseOff;
805     }
806     return true;
807   }
808 
809   return false;
810 }
811 
812 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
813                                      const CombineInfo &CI,
814                                      const CombineInfo &Paired) {
815   const unsigned Width = (CI.Width + Paired.Width);
816   switch (CI.InstClass) {
817   default:
818     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
819   case S_BUFFER_LOAD_IMM:
820     switch (Width) {
821     default:
822       return false;
823     case 2:
824     case 4:
825     case 8:
826       return true;
827     }
828   }
829 }
830 
831 const TargetRegisterClass *
832 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
833   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
834     return TRI->getRegClassForReg(*MRI, Dst->getReg());
835   }
836   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
837     return TRI->getRegClassForReg(*MRI, Src->getReg());
838   }
839   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
840     return TRI->getRegClassForReg(*MRI, Src->getReg());
841   }
842   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
843     return TRI->getRegClassForReg(*MRI, Dst->getReg());
844   }
845   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
846     return TRI->getRegClassForReg(*MRI, Src->getReg());
847   }
848   return nullptr;
849 }
850 
851 /// This function assumes that CI comes before Paired in a basic block. Return
852 /// an insertion point for the merged instruction or nullptr on failure.
853 SILoadStoreOptimizer::CombineInfo *
854 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
855                                            CombineInfo &Paired) {
856   // If another instruction has already been merged into CI, it may now be a
857   // type that we can't do any further merging into.
858   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
859     return nullptr;
860   assert(CI.InstClass == Paired.InstClass);
861 
862   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
863       getInstSubclass(Paired.I->getOpcode(), *TII))
864     return nullptr;
865 
866   // Check both offsets (or masks for MIMG) can be combined and fit in the
867   // reduced range.
868   if (CI.InstClass == MIMG) {
869     if (!dmasksCanBeCombined(CI, *TII, Paired))
870       return nullptr;
871   } else {
872     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
873       return nullptr;
874   }
875 
876   DenseSet<Register> RegDefs;
877   DenseSet<Register> RegUses;
878   CombineInfo *Where;
879   if (CI.I->mayLoad()) {
880     // Try to hoist Paired up to CI.
881     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
882     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
883       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
884         return nullptr;
885     }
886     Where = &CI;
887   } else {
888     // Try to sink CI down to Paired.
889     addDefsUsesToList(*CI.I, RegDefs, RegUses);
890     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
891       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
892         return nullptr;
893     }
894     Where = &Paired;
895   }
896 
897   // Call offsetsCanBeCombined with Modify = true so that the offsets are
898   // correct for the new instruction.  This should return true, because
899   // this function should only be called on CombineInfo objects that
900   // have already been confirmed to be mergeable.
901   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
902     offsetsCanBeCombined(CI, *STM, Paired, true);
903   return Where;
904 }
905 
906 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
907   if (STM->ldsRequiresM0Init())
908     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
909   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
910 }
911 
912 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
913   if (STM->ldsRequiresM0Init())
914     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
915 
916   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
917                         : AMDGPU::DS_READ2ST64_B64_gfx9;
918 }
919 
920 MachineBasicBlock::iterator
921 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
922                                      MachineBasicBlock::iterator InsertBefore) {
923   MachineBasicBlock *MBB = CI.I->getParent();
924 
925   // Be careful, since the addresses could be subregisters themselves in weird
926   // cases, like vectors of pointers.
927   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
928 
929   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
930   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
931 
932   unsigned NewOffset0 = CI.Offset;
933   unsigned NewOffset1 = Paired.Offset;
934   unsigned Opc =
935       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
936 
937   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
938   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
939 
940   if (NewOffset0 > NewOffset1) {
941     // Canonicalize the merged instruction so the smaller offset comes first.
942     std::swap(NewOffset0, NewOffset1);
943     std::swap(SubRegIdx0, SubRegIdx1);
944   }
945 
946   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
947          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
948 
949   const MCInstrDesc &Read2Desc = TII->get(Opc);
950 
951   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
952   Register DestReg = MRI->createVirtualRegister(SuperRC);
953 
954   DebugLoc DL = CI.I->getDebugLoc();
955 
956   Register BaseReg = AddrReg->getReg();
957   unsigned BaseSubReg = AddrReg->getSubReg();
958   unsigned BaseRegFlags = 0;
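  // If offsetsCanBeCombined picked a non-zero BaseOff, materialize it in an
  // SGPR and add it to the original address; the offsets stored in CI and
  // Paired are relative to this new base.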
959   if (CI.BaseOff) {
960     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
961     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
962         .addImm(CI.BaseOff);
963 
964     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
965     BaseRegFlags = RegState::Kill;
966 
967     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
968         .addReg(ImmReg)
969         .addReg(AddrReg->getReg(), 0, BaseSubReg)
970         .addImm(0); // clamp bit
971     BaseSubReg = 0;
972   }
973 
974   MachineInstrBuilder Read2 =
975       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
976           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
977           .addImm(NewOffset0)                        // offset0
978           .addImm(NewOffset1)                        // offset1
979           .addImm(0)                                 // gds
980           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
981 
982   (void)Read2;
983 
984   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
985 
986   // Copy to the old destination registers.
987   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
988       .add(*Dest0) // Copy to same destination including flags and sub reg.
989       .addReg(DestReg, 0, SubRegIdx0);
990   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
991       .add(*Dest1)
992       .addReg(DestReg, RegState::Kill, SubRegIdx1);
993 
994   CI.I->eraseFromParent();
995   Paired.I->eraseFromParent();
996 
997   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
998   return Read2;
999 }
1000 
1001 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1002   if (STM->ldsRequiresM0Init())
1003     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1004   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1005                         : AMDGPU::DS_WRITE2_B64_gfx9;
1006 }
1007 
1008 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1009   if (STM->ldsRequiresM0Init())
1010     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1011                           : AMDGPU::DS_WRITE2ST64_B64;
1012 
1013   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1014                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1015 }
1016 
1017 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1018     CombineInfo &CI, CombineInfo &Paired,
1019     MachineBasicBlock::iterator InsertBefore) {
1020   MachineBasicBlock *MBB = CI.I->getParent();
1021 
1022   // Be sure to use .add() and not .addReg() with these. We want to be
1023   // sure we preserve the subregister index and any register flags set on them.
1024   const MachineOperand *AddrReg =
1025       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1026   const MachineOperand *Data0 =
1027       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1028   const MachineOperand *Data1 =
1029       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1030 
1031   unsigned NewOffset0 = CI.Offset;
1032   unsigned NewOffset1 = Paired.Offset;
1033   unsigned Opc =
1034       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1035 
1036   if (NewOffset0 > NewOffset1) {
1037     // Canonicalize the merged instruction so the smaller offset comes first.
1038     std::swap(NewOffset0, NewOffset1);
1039     std::swap(Data0, Data1);
1040   }
1041 
1042   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1043          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1044 
1045   const MCInstrDesc &Write2Desc = TII->get(Opc);
1046   DebugLoc DL = CI.I->getDebugLoc();
1047 
1048   Register BaseReg = AddrReg->getReg();
1049   unsigned BaseSubReg = AddrReg->getSubReg();
1050   unsigned BaseRegFlags = 0;
1051   if (CI.BaseOff) {
1052     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1053     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1054         .addImm(CI.BaseOff);
1055 
1056     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1057     BaseRegFlags = RegState::Kill;
1058 
1059     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1060         .addReg(ImmReg)
1061         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1062         .addImm(0); // clamp bit
1063     BaseSubReg = 0;
1064   }
1065 
1066   MachineInstrBuilder Write2 =
1067       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1068           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1069           .add(*Data0)                               // data0
1070           .add(*Data1)                               // data1
1071           .addImm(NewOffset0)                        // offset0
1072           .addImm(NewOffset1)                        // offset1
1073           .addImm(0)                                 // gds
1074           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1075 
1076   CI.I->eraseFromParent();
1077   Paired.I->eraseFromParent();
1078 
1079   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1080   return Write2;
1081 }
1082 
1083 MachineBasicBlock::iterator
1084 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1085                                      MachineBasicBlock::iterator InsertBefore) {
1086   MachineBasicBlock *MBB = CI.I->getParent();
1087   DebugLoc DL = CI.I->getDebugLoc();
1088   const unsigned Opcode = getNewOpcode(CI, Paired);
1089 
1090   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1091 
1092   Register DestReg = MRI->createVirtualRegister(SuperRC);
1093   unsigned MergedDMask = CI.DMask | Paired.DMask;
1094   unsigned DMaskIdx =
1095       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1096 
1097   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1098   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1099     if (I == DMaskIdx)
1100       MIB.addImm(MergedDMask);
1101     else
1102       MIB.add((*CI.I).getOperand(I));
1103   }
1104 
1105   // It shouldn't be possible to get this far if the two instructions
1106   // don't have a single memoperand, because MachineInstr::mayAlias()
1107   // will return true if this is the case.
1108   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1109 
1110   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1111   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1112 
1113   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1114 
1115   unsigned SubRegIdx0, SubRegIdx1;
1116   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1117 
1118   // Copy to the old destination registers.
1119   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1120   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1121   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1122 
1123   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1124       .add(*Dest0) // Copy to same destination including flags and sub reg.
1125       .addReg(DestReg, 0, SubRegIdx0);
1126   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1127       .add(*Dest1)
1128       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1129 
1130   CI.I->eraseFromParent();
1131   Paired.I->eraseFromParent();
1132   return New;
1133 }
1134 
1135 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1136     CombineInfo &CI, CombineInfo &Paired,
1137     MachineBasicBlock::iterator InsertBefore) {
1138   MachineBasicBlock *MBB = CI.I->getParent();
1139   DebugLoc DL = CI.I->getDebugLoc();
1140   const unsigned Opcode = getNewOpcode(CI, Paired);
1141 
1142   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1143 
1144   Register DestReg = MRI->createVirtualRegister(SuperRC);
1145   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1146 
1147   // It shouldn't be possible to get this far if the two instructions
1148   // don't have a single memoperand, because MachineInstr::mayAlias()
1149   // will return true if this is the case.
1150   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1151 
1152   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1153   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1154 
1155   MachineInstr *New =
1156       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1157           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1158           .addImm(MergedOffset) // offset
1159           .addImm(CI.CPol)      // cpol
1160           .addMemOperand(
1161               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1162 
1163   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1164   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1165   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1166 
1167   // Copy to the old destination registers.
1168   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1169   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1170   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1171 
1172   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1173       .add(*Dest0) // Copy to same destination including flags and sub reg.
1174       .addReg(DestReg, 0, SubRegIdx0);
1175   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1176       .add(*Dest1)
1177       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1178 
1179   CI.I->eraseFromParent();
1180   Paired.I->eraseFromParent();
1181   return New;
1182 }
1183 
1184 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1185     CombineInfo &CI, CombineInfo &Paired,
1186     MachineBasicBlock::iterator InsertBefore) {
1187   MachineBasicBlock *MBB = CI.I->getParent();
1188   DebugLoc DL = CI.I->getDebugLoc();
1189 
1190   const unsigned Opcode = getNewOpcode(CI, Paired);
1191 
1192   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1193 
1194   // Copy to the new source register.
1195   Register DestReg = MRI->createVirtualRegister(SuperRC);
1196   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1197 
1198   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1199 
1200   AddressRegs Regs = getRegs(Opcode, *TII);
1201 
1202   if (Regs.VAddr)
1203     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1204 
1205   // It shouldn't be possible to get this far if the two instructions
1206   // don't have a single memoperand, because MachineInstr::mayAlias()
1207   // will return true if this is the case.
1208   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1209 
1210   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1211   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1212 
1213   MachineInstr *New =
1214     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1215         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1216         .addImm(MergedOffset) // offset
1217         .addImm(CI.CPol)      // cpol
1218         .addImm(0)            // tfe
1219         .addImm(0)            // swz
1220         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1221 
1222   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1223   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1224   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1225 
1226   // Copy to the old destination registers.
1227   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1228   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1229   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1230 
1231   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1232       .add(*Dest0) // Copy to same destination including flags and sub reg.
1233       .addReg(DestReg, 0, SubRegIdx0);
1234   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1235       .add(*Dest1)
1236       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1237 
1238   CI.I->eraseFromParent();
1239   Paired.I->eraseFromParent();
1240   return New;
1241 }
1242 
1243 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1244     CombineInfo &CI, CombineInfo &Paired,
1245     MachineBasicBlock::iterator InsertBefore) {
1246   MachineBasicBlock *MBB = CI.I->getParent();
1247   DebugLoc DL = CI.I->getDebugLoc();
1248 
1249   const unsigned Opcode = getNewOpcode(CI, Paired);
1250 
1251   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1252 
1253   // Copy to the new source register.
1254   Register DestReg = MRI->createVirtualRegister(SuperRC);
1255   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1256 
1257   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1258 
1259   AddressRegs Regs = getRegs(Opcode, *TII);
1260 
1261   if (Regs.VAddr)
1262     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1263 
1264   unsigned JoinedFormat =
1265       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1266 
1267   // It shouldn't be possible to get this far if the two instructions
1268   // don't have a single memoperand, because MachineInstr::mayAlias()
1269   // will return true if this is the case.
1270   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1271 
1272   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1273   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1274 
1275   MachineInstr *New =
1276       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1277           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1278           .addImm(MergedOffset) // offset
1279           .addImm(JoinedFormat) // format
1280           .addImm(CI.CPol)      // cpol
1281           .addImm(0)            // tfe
1282           .addImm(0)            // swz
1283           .addMemOperand(
1284               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1285 
1286   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1287   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1288   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1289 
1290   // Copy to the old destination registers.
1291   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1292   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1293   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1294 
1295   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1296       .add(*Dest0) // Copy to same destination including flags and sub reg.
1297       .addReg(DestReg, 0, SubRegIdx0);
1298   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1299       .add(*Dest1)
1300       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1301 
1302   CI.I->eraseFromParent();
1303   Paired.I->eraseFromParent();
1304   return New;
1305 }
1306 
1307 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1308     CombineInfo &CI, CombineInfo &Paired,
1309     MachineBasicBlock::iterator InsertBefore) {
1310   MachineBasicBlock *MBB = CI.I->getParent();
1311   DebugLoc DL = CI.I->getDebugLoc();
1312 
1313   const unsigned Opcode = getNewOpcode(CI, Paired);
1314 
1315   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1316   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1317   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1318 
1319   // Copy to the new source register.
1320   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1321   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1322 
1323   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1324   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1325 
1326   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1327       .add(*Src0)
1328       .addImm(SubRegIdx0)
1329       .add(*Src1)
1330       .addImm(SubRegIdx1);
1331 
1332   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1333                  .addReg(SrcReg, RegState::Kill);
1334 
1335   AddressRegs Regs = getRegs(Opcode, *TII);
1336 
1337   if (Regs.VAddr)
1338     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1339 
1340   unsigned JoinedFormat =
1341       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1342 
1343   // It shouldn't be possible to get this far if the two instructions
1344   // don't have a single memoperand, because MachineInstr::mayAlias()
1345   // will return true if this is the case.
1346   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1347 
1348   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1349   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1350 
1351   MachineInstr *New =
1352       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1353           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1354           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1355           .addImm(JoinedFormat)                     // format
1356           .addImm(CI.CPol)                          // cpol
1357           .addImm(0)                                // tfe
1358           .addImm(0)                                // swz
1359           .addMemOperand(
1360               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1361 
1362   CI.I->eraseFromParent();
1363   Paired.I->eraseFromParent();
1364   return New;
1365 }
1366 
1367 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1368                                             const CombineInfo &Paired) {
1369   const unsigned Width = CI.Width + Paired.Width;
1370 
1371   switch (CI.InstClass) {
1372   default:
1373     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1374     // FIXME: Handle d16 correctly
1375     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1376                                   Width);
1377   case TBUFFER_LOAD:
1378   case TBUFFER_STORE:
1379     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1380                                   Width);
1381 
1382   case UNKNOWN:
1383     llvm_unreachable("Unknown instruction class");
1384   case S_BUFFER_LOAD_IMM:
1385     switch (Width) {
1386     default:
1387       return 0;
1388     case 2:
1389       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1390     case 4:
1391       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1392     case 8:
1393       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1394     }
1395   case MIMG:
1396     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1397            "No overlaps");
1398     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1399   }
1400 }
1401 
1402 std::pair<unsigned, unsigned>
1403 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1404                                     const CombineInfo &Paired) {
1405   bool ReverseOrder;
1406   if (CI.InstClass == MIMG) {
1407     assert(
1408         (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1409         "No overlaps");
1410     ReverseOrder = CI.DMask > Paired.DMask;
1411   } else {
1412     ReverseOrder = CI.Offset > Paired.Offset;
1413   }
1414 
1415   unsigned Idx0;
1416   unsigned Idx1;
1417 
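       // Idxs[Row][Col] is the sub-register index for a slice that starts at
       // dword Row of the merged register and spans Col + 1 dwords.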
1418   static const unsigned Idxs[5][4] = {
1419       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1420       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1421       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1422       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1423       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1424   };
1425 
1426   assert(CI.Width >= 1 && CI.Width <= 4);
1427   assert(Paired.Width >= 1 && Paired.Width <= 4);
1428 
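       // The instruction with the smaller offset (or smaller dmask for MIMG)
       // takes the low dwords of the merged register.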
1429   if (ReverseOrder) {
1430     Idx1 = Idxs[0][Paired.Width - 1];
1431     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1432   } else {
1433     Idx0 = Idxs[0][CI.Width - 1];
1434     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1435   }
1436 
1437   return std::make_pair(Idx0, Idx1);
1438 }
1439 
1440 const TargetRegisterClass *
1441 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1442                                              const CombineInfo &Paired) {
1443   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1444     switch (CI.Width + Paired.Width) {
1445     default:
1446       return nullptr;
1447     case 2:
1448       return &AMDGPU::SReg_64_XEXECRegClass;
1449     case 4:
1450       return &AMDGPU::SGPR_128RegClass;
1451     case 8:
1452       return &AMDGPU::SGPR_256RegClass;
1453     case 16:
1454       return &AMDGPU::SGPR_512RegClass;
1455     }
1456   }
1457 
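       // For all other classes pick an AGPR or VGPR class wide enough for the
       // combined access, at 32 bits per dword of width.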
1458   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1459   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1460              ? TRI->getAGPRClassForBitWidth(BitWidth)
1461              : TRI->getVGPRClassForBitWidth(BitWidth);
1462 }
1463 
1464 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1465     CombineInfo &CI, CombineInfo &Paired,
1466     MachineBasicBlock::iterator InsertBefore) {
1467   MachineBasicBlock *MBB = CI.I->getParent();
1468   DebugLoc DL = CI.I->getDebugLoc();
1469 
1470   const unsigned Opcode = getNewOpcode(CI, Paired);
1471 
1472   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1473   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1474   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1475 
1476   // Copy to the new source register.
1477   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1478   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1479 
1480   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1481   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1482 
1483   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1484       .add(*Src0)
1485       .addImm(SubRegIdx0)
1486       .add(*Src1)
1487       .addImm(SubRegIdx1);
1488 
1489   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1490                  .addReg(SrcReg, RegState::Kill);
1491 
1492   AddressRegs Regs = getRegs(Opcode, *TII);
1493 
1494   if (Regs.VAddr)
1495     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1496 
1498   // It shouldn't be possible to get this far if the two instructions
1499   // don't have a single memoperand, because MachineInstr::mayAlias()
1500   // will return true if this is the case.
1501   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1502 
1503   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1504   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1505 
1506   MachineInstr *New =
1507     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1508         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1509         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1510         .addImm(CI.CPol)      // cpol
1511         .addImm(0)            // tfe
1512         .addImm(0)            // swz
1513         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1514 
1515   CI.I->eraseFromParent();
1516   Paired.I->eraseFromParent();
1517   return New;
1518 }
1519 
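     // Return Val as an immediate operand if it is an inline constant; otherwise
     // materialize it into a fresh SGPR with S_MOV_B32 and return that register.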
1520 MachineOperand
1521 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1522   APInt V(32, Val, true);
1523   if (TII->isInlineConstant(V))
1524     return MachineOperand::CreateImm(Val);
1525 
1526   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1527   MachineInstr *Mov =
1528   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1529           TII->get(AMDGPU::S_MOV_B32), Reg)
1530     .addImm(Val);
1531   (void)Mov;
1532   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1533   return MachineOperand::CreateReg(Reg, false);
1534 }
1535 
1536 // Compute base address using Addr and return the final register.
1537 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1538                                            const MemAddress &Addr) const {
1539   MachineBasicBlock *MBB = MI.getParent();
1540   MachineBasicBlock::iterator MBBI = MI.getIterator();
1541   DebugLoc DL = MI.getDebugLoc();
1542 
1543   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1544           Addr.Base.LoSubReg) &&
1545          "Expected 32-bit Base-Register-Low!!");
1546 
1547   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1548           Addr.Base.HiSubReg) &&
1549          "Expected 32-bit Base-Register-Hi!!");
1550 
1551   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1552   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1553   MachineOperand OffsetHi =
1554     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1555 
1556   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1557   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1558   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1559 
1560   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1561   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
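       // Emit the 64-bit add as a carry chain: V_ADD_CO_U32 produces the low
       // half and the carry, and V_ADDC_U32 consumes the carry for the high half.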
1562   MachineInstr *LoHalf =
1563     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1564       .addReg(CarryReg, RegState::Define)
1565       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1566       .add(OffsetLo)
1567       .addImm(0); // clamp bit
1568   (void)LoHalf;
1569   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1570 
1571   MachineInstr *HiHalf =
1572   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1573     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1574     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1575     .add(OffsetHi)
1576     .addReg(CarryReg, RegState::Kill)
1577     .addImm(0); // clamp bit
1578   (void)HiHalf;
1579   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1580 
1581   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1582   MachineInstr *FullBase =
1583     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1584       .addReg(DestSub0)
1585       .addImm(AMDGPU::sub0)
1586       .addReg(DestSub1)
1587       .addImm(AMDGPU::sub1);
1588   (void)FullBase;
1589   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1590 
1591   return FullDestReg;
1592 }
1593 
1594 // Update MI's base register and immediate offset to NewBase and NewOffset.
1595 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1596                                                Register NewBase,
1597                                                int32_t NewOffset) const {
1598   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1599   Base->setReg(NewBase);
1600   Base->setIsKill(false);
1601   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1602 }
1603 
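     // Return Op's constant value if it is an immediate or was materialized from
     // an immediate by S_MOV_B32; otherwise return None.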
1604 Optional<int32_t>
1605 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1606   if (Op.isImm())
1607     return Op.getImm();
1608 
1609   if (!Op.isReg())
1610     return None;
1611 
1612   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1613   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1614       !Def->getOperand(1).isImm())
1615     return None;
1616 
1617   return Def->getOperand(1).getImm();
1618 }
1619 
1620 // Analyzes Base and extracts:
1621 //  - the 32-bit base registers and their subregisters
1622 //  - the 64-bit constant offset
1623 // The base computation is expected to look like:
1624 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1625 //   %LO:vgpr_32, %c:sreg_64_xexec =
1626 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1627 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1628 //   %Base:vreg_64 =
1629 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1630 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1631                                                       MemAddress &Addr) const {
1632   if (!Base.isReg())
1633     return;
1634 
1635   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1636   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1637       || Def->getNumOperands() != 5)
1638     return;
1639 
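       // REG_SEQUENCE operands are: dst, lo value, sub0 index, hi value, sub1 index.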
1640   MachineOperand BaseLo = Def->getOperand(1);
1641   MachineOperand BaseHi = Def->getOperand(3);
1642   if (!BaseLo.isReg() || !BaseHi.isReg())
1643     return;
1644 
1645   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1646   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1647 
1648   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1649       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1650     return;
1651 
1652   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1653   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1654 
1655   auto Offset0P = extractConstOffset(*Src0);
1656   if (Offset0P)
1657     BaseLo = *Src1;
1658   else {
1659     if (!(Offset0P = extractConstOffset(*Src1)))
1660       return;
1661     BaseLo = *Src0;
1662   }
1663 
1664   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1665   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1666 
1667   if (Src0->isImm())
1668     std::swap(Src0, Src1);
1669 
1670   if (!Src1->isImm())
1671     return;
1672 
1673   uint64_t Offset1 = Src1->getImm();
1674   BaseHi = *Src0;
1675 
1676   Addr.Base.LoReg = BaseLo.getReg();
1677   Addr.Base.HiReg = BaseHi.getReg();
1678   Addr.Base.LoSubReg = BaseLo.getSubReg();
1679   Addr.Base.HiSubReg = BaseHi.getSubReg();
1680   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1681 }
1682 
1683 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1684     MachineInstr &MI,
1685     MemInfoMap &Visited,
1686     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1687 
1688   if (!(MI.mayLoad() ^ MI.mayStore()))
1689     return false;
1690 
1691   // TODO: Support flat and scratch.
1692   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1693     return false;
1694 
1695   if (MI.mayLoad() &&
1696       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1697     return false;
1698 
1699   if (AnchorList.count(&MI))
1700     return false;
1701 
1702   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1703 
1704   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1705     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1706     return false;
1707   }
1708 
1709   // Step 1: Find the base registers and a 64-bit constant offset.
1710   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1711   MemAddress MAddr;
1712   if (Visited.find(&MI) == Visited.end()) {
1713     processBaseWithConstOffset(Base, MAddr);
1714     Visited[&MI] = MAddr;
1715   } else
1716     MAddr = Visited[&MI];
1717 
1718   if (MAddr.Offset == 0) {
1719     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1720                          " constant offsets that can be promoted.\n";);
1721     return false;
1722   }
1723 
1724   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1725              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1726 
1727   // Step 2: Traverse MI's basic block and find an anchor (with the same base
1728   // registers) whose offset has the largest legal 13-bit distance from MI's.
1729   // E.g. (64-bit loads)
1730   // bb:
1731   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1732   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1733   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1734   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1735   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1736   //
1737   // Starting from the first load, the optimization tries to find a new base
1738   // from which (&a + 4096) is within 13-bit offset range. Both &a + 6144 and
1739   // &a + 8192 are within 13-bit range of &a + 4096. The heuristic picks
1740   // &a + 8192 as the new base (anchor) because the maximum distance can
1741   // presumably accommodate more intermediate addresses.
1742   //
1743   // Step 3: Move (&a + 8192) above load1, then compute and promote offsets
1744   // relative to (&a + 8192) for load1, load2 and load4.
1745   //   addr = &a + 8192
1746   //   load1 = load(addr,       -4096)
1747   //   load2 = load(addr,       -2048)
1748   //   load3 = load(addr,       0)
1749   //   load4 = load(addr,       2048)
1750   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1751   //
1752   MachineInstr *AnchorInst = nullptr;
1753   MemAddress AnchorAddr;
1754   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1755   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1756 
1757   MachineBasicBlock *MBB = MI.getParent();
1758   MachineBasicBlock::iterator E = MBB->end();
1759   MachineBasicBlock::iterator MBBI = MI.getIterator();
1760   ++MBBI;
1761   const SITargetLowering *TLI =
1762     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1763 
1764   for ( ; MBBI != E; ++MBBI) {
1765     MachineInstr &MINext = *MBBI;
1766     // TODO: Support finding an anchor (with the same base) among store
1767     // addresses or other load addresses whose opcodes differ.
1768     if (MINext.getOpcode() != MI.getOpcode() ||
1769         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1770       continue;
1771 
1772     const MachineOperand &BaseNext =
1773       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1774     MemAddress MAddrNext;
1775     if (Visited.find(&MINext) == Visited.end()) {
1776       processBaseWithConstOffset(BaseNext, MAddrNext);
1777       Visited[&MINext] = MAddrNext;
1778     } else
1779       MAddrNext = Visited[&MINext];
1780 
1781     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1782         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1783         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1784         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1785       continue;
1786 
1787     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1788 
1789     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1790     TargetLoweringBase::AddrMode AM;
1791     AM.HasBaseReg = true;
1792     AM.BaseOffs = Dist;
1793     if (TLI->isLegalGlobalAddressingMode(AM) &&
1794         (uint32_t)std::abs(Dist) > MaxDist) {
1795       MaxDist = std::abs(Dist);
1796 
1797       AnchorAddr = MAddrNext;
1798       AnchorInst = &MINext;
1799     }
1800   }
1801 
1802   if (AnchorInst) {
1803     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1804                AnchorInst->dump());
1805     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1806                <<  AnchorAddr.Offset << "\n\n");
1807 
1808     // Instead of moving up, just re-compute anchor-instruction's base address.
1809     Register Base = computeBase(MI, AnchorAddr);
1810 
1811     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1812     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1813 
1814     for (auto P : InstsWCommonBase) {
1815       TargetLoweringBase::AddrMode AM;
1816       AM.HasBaseReg = true;
1817       AM.BaseOffs = P.second - AnchorAddr.Offset;
1818 
1819       if (TLI->isLegalGlobalAddressingMode(AM)) {
1820         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1821                    dbgs() << ")"; P.first->dump());
1822         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1823         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1824       }
1825     }
1826     AnchorList.insert(AnchorInst);
1827     return true;
1828   }
1829 
1830   return false;
1831 }
1832 
1833 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1834                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1835   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1836     if (AddrList.front().InstClass == CI.InstClass &&
1837         AddrList.front().IsAGPR == CI.IsAGPR &&
1838         AddrList.front().hasSameBaseAddress(*CI.I)) {
1839       AddrList.emplace_back(CI);
1840       return;
1841     }
1842   }
1843 
1844   // Base address not found, so add a new list.
1845   MergeableInsts.emplace_back(1, CI);
1846 }
1847 
1848 std::pair<MachineBasicBlock::iterator, bool>
1849 SILoadStoreOptimizer::collectMergeableInsts(
1850     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1851     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1852     std::list<std::list<CombineInfo>> &MergeableInsts) const {
1853   bool Modified = false;
1854 
1855   // Sort potentially mergeable instructions into lists, one list per base address.
1856   unsigned Order = 0;
1857   MachineBasicBlock::iterator BlockI = Begin;
1858   for (; BlockI != End; ++BlockI) {
1859     MachineInstr &MI = *BlockI;
1860 
1861     // We run this before checking if an address is mergeable, because it can produce
1862     // better code even if the instructions aren't mergeable.
1863     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1864       Modified = true;
1865 
1866     // Treat volatile accesses, ordered accesses and unmodeled side effects as
1867     // barriers. We can still look for merges past this barrier, just not across it.
1868     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
1869       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
1870 
1871       // Search will resume after this instruction in a separate merge list.
1872       ++BlockI;
1873       break;
1874     }
1875 
1876     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1877     if (InstClass == UNKNOWN)
1878       continue;
1879 
1880     // Do not merge VMEM buffer instructions with the "swizzled" bit set.
1881     int Swizzled =
1882         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
1883     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
1884       continue;
1885 
1886     CombineInfo CI;
1887     CI.setMI(MI, *this);
1888     CI.Order = Order++;
1889 
1890     if (!CI.hasMergeableAddress(*MRI))
1891       continue;
1892 
1893     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
1894       // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
1895       //        operands. However, we report that ds_write2 takes only VGPR
1896       //        data so that machine copy propagation does not create an
1897       //        illegal instruction mixing VGPR and AGPR sources. Consequently,
1898       //        if we did create such an instruction, the verifier would
1899       //        complain.
1900       continue;
1901     }
1902 
1903     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
1904 
1905     addInstToMergeableList(CI, MergeableInsts);
1906   }
1907 
1908   // At this point we have lists of mergeable instructions.
1909   //
1910   // Part 2: Discard any list with fewer than two entries, since a merge needs
1911   // at least two instructions, and sort the remaining lists by offset so that
1912   // merge candidates end up adjacent to each other. The actual pairing is done
1913   // later, in optimizeInstsWithSameBaseAddr().
1914 
1915   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1916                                                    E = MergeableInsts.end(); I != E;) {
1917 
1918     std::list<CombineInfo> &MergeList = *I;
1919     if (MergeList.size() <= 1) {
1920       // This means we have found only one instruction with a given address
1921       // that can be merged, and we need at least 2 instructions to do a merge,
1922       // so this list can be discarded.
1923       I = MergeableInsts.erase(I);
1924       continue;
1925     }
1926 
1927     // Sort the list by offset so that mergeable instructions end up adjacent
1928     // to each other, which makes it easier to find matches.
1930     MergeList.sort(
1931         [] (const CombineInfo &A, const CombineInfo &B) {
1932           return A.Offset < B.Offset;
1933         });
1934     ++I;
1935   }
1936 
1937   return std::make_pair(BlockI, Modified);
1938 }
1939 
1940 // Scan through looking for adjacent LDS operations with constant offsets from
1941 // the same base register. We rely on the scheduler to do the hard work of
1942 // clustering nearby loads, and assume these are all adjacent.
1943 bool SILoadStoreOptimizer::optimizeBlock(
1944                        std::list<std::list<CombineInfo> > &MergeableInsts) {
1945   bool Modified = false;
1946 
1947   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1948                                                    E = MergeableInsts.end(); I != E;) {
1949     std::list<CombineInfo> &MergeList = *I;
1950 
1951     bool OptimizeListAgain = false;
1952     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1953       // We weren't able to make any changes, so delete the list so we don't
1954       // process the same instructions the next time we try to optimize this
1955       // block.
1956       I = MergeableInsts.erase(I);
1957       continue;
1958     }
1959 
1960     Modified = true;
1961 
1962     // We made changes, but also determined that there were no more optimization
1963     // opportunities, so we don't need to reprocess the list.
1964     if (!OptimizeListAgain) {
1965       I = MergeableInsts.erase(I);
1966       continue;
1967     }
1968     OptimizeAgain = true;
1969   }
1970   return Modified;
1971 }
1972 
1973 bool
1974 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1975                                           std::list<CombineInfo> &MergeList,
1976                                           bool &OptimizeListAgain) {
1977   if (MergeList.empty())
1978     return false;
1979 
1980   bool Modified = false;
1981 
1982   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
1983        Next = std::next(I)) {
1984 
1985     auto First = I;
1986     auto Second = Next;
1987 
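         // Make sure CI refers to the instruction that appears earlier in the
         // block and Paired to the later one.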
1988     if ((*First).Order > (*Second).Order)
1989       std::swap(First, Second);
1990     CombineInfo &CI = *First;
1991     CombineInfo &Paired = *Second;
1992 
1993     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
1994     if (!Where) {
1995       ++I;
1996       continue;
1997     }
1998 
1999     Modified = true;
2000 
2001     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2002 
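         // Perform the class-specific merge, inserting the new instruction before
         // Where->I. OptimizeListAgain is set when the merged access is still
         // narrower than the widest encoding, so it may be merged again later.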
2003     MachineBasicBlock::iterator NewMI;
2004     switch (CI.InstClass) {
2005     default:
2006       llvm_unreachable("unknown InstClass");
2007       break;
2008     case DS_READ:
2009       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2010       break;
2011     case DS_WRITE:
2012       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2013       break;
2014     case S_BUFFER_LOAD_IMM:
2015       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2016       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2017       break;
2018     case BUFFER_LOAD:
2019       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2020       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2021       break;
2022     case BUFFER_STORE:
2023       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2024       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2025       break;
2026     case MIMG:
2027       NewMI = mergeImagePair(CI, Paired, Where->I);
2028       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2029       break;
2030     case TBUFFER_LOAD:
2031       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2032       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2033       break;
2034     case TBUFFER_STORE:
2035       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2036       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2037       break;
2038     }
2039     CI.setMI(NewMI, *this);
2040     CI.Order = Where->Order;
2041     if (I == Second)
2042       I = Next;
2043 
2044     MergeList.erase(Second);
2045   }
2046 
2047   return Modified;
2048 }
2049 
2050 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2051   if (skipFunction(MF.getFunction()))
2052     return false;
2053 
2054   STM = &MF.getSubtarget<GCNSubtarget>();
2055   if (!STM->loadStoreOptEnabled())
2056     return false;
2057 
2058   TII = STM->getInstrInfo();
2059   TRI = &TII->getRegisterInfo();
2060 
2061   MRI = &MF.getRegInfo();
2062   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2063 
2064   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2065 
2066   bool Modified = false;
2067 
2068   // Contains the list of instructions for which constant offsets are being
2069   // promoted to the immediate. This is tracked for an entire block at a time.
2070   SmallPtrSet<MachineInstr *, 4> AnchorList;
2071   MemInfoMap Visited;
2072 
2073   for (MachineBasicBlock &MBB : MF) {
2074     MachineBasicBlock::iterator SectionEnd;
2075     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2076          I = SectionEnd) {
2077       bool CollectModified;
2078       std::list<std::list<CombineInfo>> MergeableInsts;
2079 
2080       // First pass: Collect list of all instructions we know how to merge in a
2081       // subset of the block.
2082       std::tie(SectionEnd, CollectModified) =
2083           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2084 
2085       Modified |= CollectModified;
2086 
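           // Keep re-running the merge over this section as long as a merge
           // produced a result that may itself be mergeable (for example, two
           // dword loads merged into a dwordx2 that can later form a dwordx4).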
2087       do {
2088         OptimizeAgain = false;
2089         Modified |= optimizeBlock(MergeableInsts);
2090       } while (OptimizeAgain);
2091     }
2092 
2093     Visited.clear();
2094     AnchorList.clear();
2095   }
2096 
2097   return Modified;
2098 }
2099