1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
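//
// A similar VMEM case (register numbers are illustrative only):
//  buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
//  buffer_load_dword v5, v0, s[0:3], 0 offen offset:8
// ==>
//  buffer_load_dwordx2 v[4:5], v0, s[0:3], 0 offen offset:4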
21 //
// This pass also tries to promote a constant offset to the immediate offset
// field by adjusting the base. It tries to use a base from a nearby
// instruction that allows the access to have a 13-bit constant offset, which
// is then promoted to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that the differences do fit,
//   we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82 };
83 
84 struct AddressRegs {
85   unsigned char NumVAddrs = 0;
86   bool SBase = false;
87   bool SRsrc = false;
88   bool SOffset = false;
89   bool VAddr = false;
90   bool Addr = false;
91   bool SSamp = false;
92 };
93 
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96 
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98   struct CombineInfo {
99     MachineBasicBlock::iterator I;
100     unsigned EltSize;
101     unsigned Offset;
102     unsigned Width;
103     unsigned Format;
104     unsigned BaseOff;
105     unsigned DMask;
106     InstClassEnum InstClass;
107     unsigned CPol = 0;
108     bool UseST64;
109     int AddrIdx[MaxAddressRegs];
110     const MachineOperand *AddrReg[MaxAddressRegs];
111     unsigned NumAddresses;
112     unsigned Order;
113 
114     bool hasSameBaseAddress(const MachineInstr &MI) {
115       for (unsigned i = 0; i < NumAddresses; i++) {
116         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117 
118         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121             return false;
122           }
123           continue;
124         }
125 
126         // Check same base pointer. Be careful of subregisters, which can occur
127         // with vectors of pointers.
128         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
131         }
132       }
133       return true;
134     }
135 
136     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137       for (unsigned i = 0; i < NumAddresses; ++i) {
138         const MachineOperand *AddrOp = AddrReg[i];
139         // Immediates are always OK.
140         if (AddrOp->isImm())
141           continue;
142 
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
146         if (!AddrOp->isReg())
147           return false;
148 
149         // TODO: We should be able to merge physical reg addresses.
150         if (AddrOp->getReg().isPhysical())
151           return false;
152 
        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
155         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156           return false;
157       }
158       return true;
159     }
160 
161     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
162   };
163 
164   struct BaseRegisters {
165     Register LoReg;
166     Register HiReg;
167 
168     unsigned LoSubReg = 0;
169     unsigned HiSubReg = 0;
170   };
171 
172   struct MemAddress {
173     BaseRegisters Base;
174     int64_t Offset = 0;
175   };
176 
177   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
178 
179 private:
180   const GCNSubtarget *STM = nullptr;
181   const SIInstrInfo *TII = nullptr;
182   const SIRegisterInfo *TRI = nullptr;
183   MachineRegisterInfo *MRI = nullptr;
184   AliasAnalysis *AA = nullptr;
185   bool OptimizeAgain;
186 
187   static bool dmasksCanBeCombined(const CombineInfo &CI,
188                                   const SIInstrInfo &TII,
189                                   const CombineInfo &Paired);
190   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
191                                    CombineInfo &Paired, bool Modify = false);
192   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
193                         const CombineInfo &Paired);
194   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
195   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
196                                                      const CombineInfo &Paired);
197   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
198                                                     const CombineInfo &Paired);
199   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
200 
201   bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
202                             SmallVectorImpl<MachineInstr *> &InstsToMove);
203 
204   unsigned read2Opcode(unsigned EltSize) const;
205   unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
209 
210   unsigned write2Opcode(unsigned EltSize) const;
211   unsigned write2ST64Opcode(unsigned EltSize) const;
212   MachineBasicBlock::iterator
213   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
214                   const SmallVectorImpl<MachineInstr *> &InstsToMove);
215   MachineBasicBlock::iterator
216   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
217                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
218   MachineBasicBlock::iterator
219   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
220                           const SmallVectorImpl<MachineInstr *> &InstsToMove);
221   MachineBasicBlock::iterator
222   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
223                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
224   MachineBasicBlock::iterator
225   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
226                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
227   MachineBasicBlock::iterator
228   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
229                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
230   MachineBasicBlock::iterator
231   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
232                         const SmallVectorImpl<MachineInstr *> &InstsToMove);
233 
234   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
235                            int32_t NewOffset) const;
236   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
237   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
238   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
239   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from a nearby instruction that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
243   bool promoteConstantOffsetToImm(MachineInstr &CI,
244                                   MemInfoMap &Visited,
245                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
246   void addInstToMergeableList(const CombineInfo &CI,
247                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
248 
249   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
250       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
251       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
252       std::list<std::list<CombineInfo>> &MergeableInsts) const;
253 
254 public:
255   static char ID;
256 
257   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
258     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
259   }
260 
261   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
262                                      bool &OptimizeListAgain);
263   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
264 
265   bool runOnMachineFunction(MachineFunction &MF) override;
266 
267   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
268 
269   void getAnalysisUsage(AnalysisUsage &AU) const override {
270     AU.setPreservesCFG();
271     AU.addRequired<AAResultsWrapperPass>();
272 
273     MachineFunctionPass::getAnalysisUsage(AU);
274   }
275 
276   MachineFunctionProperties getRequiredProperties() const override {
277     return MachineFunctionProperties()
278       .set(MachineFunctionProperties::Property::IsSSA);
279   }
280 };
281 
282 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
283   const unsigned Opc = MI.getOpcode();
284 
285   if (TII.isMUBUF(Opc)) {
286     // FIXME: Handle d16 correctly
287     return AMDGPU::getMUBUFElements(Opc);
288   }
289   if (TII.isMIMG(MI)) {
290     uint64_t DMaskImm =
291         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
292     return countPopulation(DMaskImm);
293   }
294   if (TII.isMTBUF(Opc)) {
295     return AMDGPU::getMTBUFElements(Opc);
296   }
297 
298   switch (Opc) {
299   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
300     return 1;
301   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
302     return 2;
303   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
304     return 4;
305   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
306     return 8;
307   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
308   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
309   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
310   case AMDGPU::DS_WRITE_B32_gfx9:
311     return 1;
312   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
313   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
314   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
315   case AMDGPU::DS_WRITE_B64_gfx9:
316     return 2;
317   default:
318     return 0;
319   }
320 }
321 
322 /// Maps instruction opcode to enum InstClassEnum.
323 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
324   switch (Opc) {
325   default:
326     if (TII.isMUBUF(Opc)) {
327       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
328       default:
329         return UNKNOWN;
330       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
331       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
332       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
333       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
334         return BUFFER_LOAD;
335       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
336       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
337       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
338       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
339         return BUFFER_STORE;
340       }
341     }
342     if (TII.isMIMG(Opc)) {
343       // Ignore instructions encoded without vaddr.
344       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
345           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
346         return UNKNOWN;
347       // Ignore BVH instructions
348       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
349         return UNKNOWN;
350       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
351       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
352           TII.isGather4(Opc))
353         return UNKNOWN;
354       return MIMG;
355     }
356     if (TII.isMTBUF(Opc)) {
357       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
358       default:
359         return UNKNOWN;
360       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
361       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
362       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
363       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
364         return TBUFFER_LOAD;
365       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
366       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
367       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
368       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
369         return TBUFFER_STORE;
370       }
371     }
372     return UNKNOWN;
373   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
375   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
377     return S_BUFFER_LOAD_IMM;
378   case AMDGPU::DS_READ_B32:
379   case AMDGPU::DS_READ_B32_gfx9:
380   case AMDGPU::DS_READ_B64:
381   case AMDGPU::DS_READ_B64_gfx9:
382     return DS_READ;
383   case AMDGPU::DS_WRITE_B32:
384   case AMDGPU::DS_WRITE_B32_gfx9:
385   case AMDGPU::DS_WRITE_B64:
386   case AMDGPU::DS_WRITE_B64_gfx9:
387     return DS_WRITE;
388   }
389 }
390 
391 /// Determines instruction subclass from opcode. Only instructions
392 /// of the same subclass can be merged together.
393 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
394   switch (Opc) {
395   default:
396     if (TII.isMUBUF(Opc))
397       return AMDGPU::getMUBUFBaseOpcode(Opc);
398     if (TII.isMIMG(Opc)) {
399       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
400       assert(Info);
401       return Info->BaseOpcode;
402     }
403     if (TII.isMTBUF(Opc))
404       return AMDGPU::getMTBUFBaseOpcode(Opc);
405     return -1;
406   case AMDGPU::DS_READ_B32:
407   case AMDGPU::DS_READ_B32_gfx9:
408   case AMDGPU::DS_READ_B64:
409   case AMDGPU::DS_READ_B64_gfx9:
410   case AMDGPU::DS_WRITE_B32:
411   case AMDGPU::DS_WRITE_B32_gfx9:
412   case AMDGPU::DS_WRITE_B64:
413   case AMDGPU::DS_WRITE_B64_gfx9:
414     return Opc;
415   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
416   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
417   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
418   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
419     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
420   }
421 }
422 
423 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
424   AddressRegs Result;
425 
426   if (TII.isMUBUF(Opc)) {
427     if (AMDGPU::getMUBUFHasVAddr(Opc))
428       Result.VAddr = true;
429     if (AMDGPU::getMUBUFHasSrsrc(Opc))
430       Result.SRsrc = true;
431     if (AMDGPU::getMUBUFHasSoffset(Opc))
432       Result.SOffset = true;
433 
434     return Result;
435   }
436 
437   if (TII.isMIMG(Opc)) {
438     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
439     if (VAddr0Idx >= 0) {
440       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
441       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
442     } else {
443       Result.VAddr = true;
444     }
445     Result.SRsrc = true;
446     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
447     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
448       Result.SSamp = true;
449 
450     return Result;
451   }
452   if (TII.isMTBUF(Opc)) {
453     if (AMDGPU::getMTBUFHasVAddr(Opc))
454       Result.VAddr = true;
455     if (AMDGPU::getMTBUFHasSrsrc(Opc))
456       Result.SRsrc = true;
457     if (AMDGPU::getMTBUFHasSoffset(Opc))
458       Result.SOffset = true;
459 
460     return Result;
461   }
462 
463   switch (Opc) {
464   default:
465     return Result;
466   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
467   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
468   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
469   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
470     Result.SBase = true;
471     return Result;
472   case AMDGPU::DS_READ_B32:
473   case AMDGPU::DS_READ_B64:
474   case AMDGPU::DS_READ_B32_gfx9:
475   case AMDGPU::DS_READ_B64_gfx9:
476   case AMDGPU::DS_WRITE_B32:
477   case AMDGPU::DS_WRITE_B64:
478   case AMDGPU::DS_WRITE_B32_gfx9:
479   case AMDGPU::DS_WRITE_B64_gfx9:
480     Result.Addr = true;
481     return Result;
482   }
483 }
484 
485 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
486                                               const SILoadStoreOptimizer &LSO) {
487   I = MI;
488   unsigned Opc = MI->getOpcode();
489   InstClass = getInstClass(Opc, *LSO.TII);
490 
491   if (InstClass == UNKNOWN)
492     return;
493 
494   switch (InstClass) {
495   case DS_READ:
496    EltSize =
497           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
498                                                                           : 4;
499    break;
500   case DS_WRITE:
501     EltSize =
502           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
503                                                                             : 4;
504     break;
505   case S_BUFFER_LOAD_IMM:
506     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
507     break;
508   default:
509     EltSize = 4;
510     break;
511   }
512 
513   if (InstClass == MIMG) {
514     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
515     // Offset is not considered for MIMG instructions.
516     Offset = 0;
517   } else {
518     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
519     Offset = I->getOperand(OffsetIdx).getImm();
520   }
521 
522   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
523     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
524 
525   Width = getOpcodeWidth(*I, *LSO.TII);
526 
527   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
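    // The DS offset operand is a 16-bit field, so mask off anything above it.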
528     Offset &= 0xffff;
529   } else if (InstClass != MIMG) {
530     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
531   }
532 
533   AddressRegs Regs = getRegs(Opc, *LSO.TII);
534 
535   NumAddresses = 0;
536   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
537     AddrIdx[NumAddresses++] =
538         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
539   if (Regs.Addr)
540     AddrIdx[NumAddresses++] =
541         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
542   if (Regs.SBase)
543     AddrIdx[NumAddresses++] =
544         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
545   if (Regs.SRsrc)
546     AddrIdx[NumAddresses++] =
547         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
548   if (Regs.SOffset)
549     AddrIdx[NumAddresses++] =
550         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
551   if (Regs.VAddr)
552     AddrIdx[NumAddresses++] =
553         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
554   if (Regs.SSamp)
555     AddrIdx[NumAddresses++] =
556         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
557   assert(NumAddresses <= MaxAddressRegs);
558 
559   for (unsigned J = 0; J < NumAddresses; J++)
560     AddrReg[J] = &I->getOperand(AddrIdx[J]);
561 }
562 
563 } // end anonymous namespace.
564 
565 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
566                       "SI Load Store Optimizer", false, false)
567 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
568 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
569                     false, false)
570 
571 char SILoadStoreOptimizer::ID = 0;
572 
573 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
574 
575 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
576   return new SILoadStoreOptimizer();
577 }
578 
579 static void moveInstsAfter(MachineBasicBlock::iterator I,
580                            ArrayRef<MachineInstr *> InstsToMove) {
581   MachineBasicBlock *MBB = I->getParent();
582   ++I;
583   for (MachineInstr *MI : InstsToMove) {
584     MI->removeFromParent();
585     MBB->insert(I, MI);
586   }
587 }
588 
589 static void addDefsUsesToList(const MachineInstr &MI,
590                               DenseSet<Register> &RegDefs,
591                               DenseSet<Register> &PhysRegUses) {
592   for (const MachineOperand &Op : MI.operands()) {
593     if (Op.isReg()) {
594       if (Op.isDef())
595         RegDefs.insert(Op.getReg());
596       else if (Op.readsReg() && Op.getReg().isPhysical())
597         PhysRegUses.insert(Op.getReg());
598     }
599   }
600 }
601 
602 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
603                                       MachineBasicBlock::iterator B,
604                                       AliasAnalysis *AA) {
605   // RAW or WAR - cannot reorder
606   // WAW - cannot reorder
607   // RAR - safe to reorder
608   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
609 }
610 
611 // Add MI and its defs to the lists if MI reads one of the defs that are
612 // already in the list. Returns true in that case.
613 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
614                                   DenseSet<Register> &PhysRegUses,
615                                   SmallVectorImpl<MachineInstr *> &Insts) {
616   for (MachineOperand &Use : MI.operands()) {
617     // If one of the defs is read, then there is a use of Def between I and the
618     // instruction that I will potentially be merged with. We will need to move
619     // this instruction after the merged instructions.
620     //
621     // Similarly, if there is a def which is read by an instruction that is to
622     // be moved for merging, then we need to move the def-instruction as well.
623     // This can only happen for physical registers such as M0; virtual
624     // registers are in SSA form.
625     if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
626                         (Use.isDef() && RegDefs.count(Use.getReg())) ||
627                         (Use.isDef() && Use.getReg().isPhysical() &&
628                          PhysRegUses.count(Use.getReg())))) {
629       Insts.push_back(&MI);
630       addDefsUsesToList(MI, RegDefs, PhysRegUses);
631       return true;
632     }
633   }
634 
635   return false;
636 }
637 
638 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
639                                     ArrayRef<MachineInstr *> InstsToMove,
640                                     AliasAnalysis *AA) {
641   assert(MemOp.mayLoadOrStore());
642 
643   for (MachineInstr *InstToMove : InstsToMove) {
644     if (!InstToMove->mayLoadOrStore())
645       continue;
646     if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
647       return false;
648   }
649   return true;
650 }
651 
// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
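// For example, if \p A covers bytes [16, 20) and \p B covers bytes [20, 24),
// the combined MMO describes the 8 bytes starting at offset 16.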
654 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
655                                                    const MachineMemOperand *A,
656                                                    const MachineMemOperand *B) {
657   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
658   unsigned Size = A->getSize() + B->getSize();
659   // This function adds the offset parameter to the existing offset for A,
660   // so we pass 0 here as the offset and then manually set it to the correct
661   // value after the call.
662   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
663   MMO->setOffset(MinOffset);
664   return MMO;
665 }
666 
667 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
668                                                const SIInstrInfo &TII,
669                                                const CombineInfo &Paired) {
670   assert(CI.InstClass == MIMG);
671 
672   // Ignore instructions with tfe/lwe set.
673   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
674   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
675 
676   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
677     return false;
678 
679   // Check other optional immediate operands for equality.
680   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
681                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
682                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
683 
684   for (auto op : OperandsToMatch) {
685     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
686     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
687       return false;
688     if (Idx != -1 &&
689         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
690       return false;
691   }
692 
693   // Check DMask for overlaps.
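  // The pair is only accepted if the component ranges selected by the two
  // dmasks do not interleave: every set bit of the smaller mask must lie
  // strictly below the lowest set bit of the larger one. For example,
  // dmasks 0b0011 and 0b1100 can be combined, while 0b0101 and 0b0010 cannot.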
694   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
695   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
696 
697   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
698   if ((1u << AllowedBitsForMin) <= MinMask)
699     return false;
700 
701   return true;
702 }
703 
704 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
705                                        unsigned ComponentCount,
706                                        const GCNSubtarget &STI) {
707   if (ComponentCount > 4)
708     return 0;
709 
710   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
711       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
712   if (!OldFormatInfo)
713     return 0;
714 
715   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
716       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
717                                            ComponentCount,
718                                            OldFormatInfo->NumFormat, STI);
719 
720   if (!NewFormatInfo)
721     return 0;
722 
723   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
724          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
725 
726   return NewFormatInfo->Format;
727 }
728 
729 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
730 // highest power of two. Note that the result is well defined for all inputs
731 // including corner cases like:
732 // - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
734 // - if Lo > Hi, return 0 (as if the range wrapped around)
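// A couple of worked examples (illustration only):
// - mostAlignedValueInRange(3, 10) == 8 (the only multiple of 8 in the range)
// - mostAlignedValueInRange(5, 7)  == 6 (6 is even, both endpoints are odd)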
735 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
736   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
737 }
738 
739 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
740                                                 const GCNSubtarget &STI,
741                                                 CombineInfo &Paired,
742                                                 bool Modify) {
743   assert(CI.InstClass != MIMG);
744 
745   // XXX - Would the same offset be OK? Is there any reason this would happen or
746   // be useful?
747   if (CI.Offset == Paired.Offset)
748     return false;
749 
750   // This won't be valid if the offset isn't aligned.
751   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
752     return false;
753 
754   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
755 
756     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
757         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
758     if (!Info0)
759       return false;
760     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
761         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
762     if (!Info1)
763       return false;
764 
765     if (Info0->BitsPerComp != Info1->BitsPerComp ||
766         Info0->NumFormat != Info1->NumFormat)
767       return false;
768 
769     // TODO: Should be possible to support more formats, but if format loads
770     // are not dword-aligned, the merged load might not be valid.
771     if (Info0->BitsPerComp != 32)
772       return false;
773 
774     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
775       return false;
776   }
777 
778   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
779   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
780   CI.UseST64 = false;
781   CI.BaseOff = 0;
782 
783   // Handle all non-DS instructions.
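  // For these, the two accesses must be exactly adjacent in element units and
  // have matching cache policy (cpol) bits: e.g. dword loads at element
  // offsets 4 and 5 (Width 1 each) are adjacent, while offsets 4 and 6 are
  // not.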
784   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
789   }
790 
791   // If the offset in elements doesn't fit in 8-bits, we might be able to use
792   // the stride 64 versions.
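  // For example, two ds_read_b32 at byte offsets 0x0 and 0x4000 have element
  // offsets 0 and 4096, which do not fit in 8 bits, but 0 and 4096/64 = 64 do,
  // so ds_read2st64_b32 can encode them.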
793   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
794       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
795     if (Modify) {
796       CI.Offset = EltOffset0 / 64;
797       Paired.Offset = EltOffset1 / 64;
798       CI.UseST64 = true;
799     }
800     return true;
801   }
802 
803   // Check if the new offsets fit in the reduced 8-bit range.
804   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
805     if (Modify) {
806       CI.Offset = EltOffset0;
807       Paired.Offset = EltOffset1;
808     }
809     return true;
810   }
811 
812   // Try to shift base address to decrease offsets.
813   uint32_t Min = std::min(EltOffset0, EltOffset1);
814   uint32_t Max = std::max(EltOffset0, EltOffset1);
815 
816   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
817   if (((Max - Min) & ~Mask) == 0) {
818     if (Modify) {
819       // From the range of values we could use for BaseOff, choose the one that
820       // is aligned to the highest power of two, to maximise the chance that
821       // the same offset can be reused for other load/store pairs.
822       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
823       // Copy the low bits of the offsets, so that when we adjust them by
824       // subtracting BaseOff they will be multiples of 64.
825       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
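      // For example, with element offsets 0x1010 and 0x3010 (EltSize 4),
      // BaseOff becomes 0x10, so the encoded offsets are (0x1010-0x10)/64 = 64
      // and (0x3010-0x10)/64 = 192, both of which fit in 8 bits.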
826       CI.BaseOff = BaseOff * CI.EltSize;
827       CI.Offset = (EltOffset0 - BaseOff) / 64;
828       Paired.Offset = (EltOffset1 - BaseOff) / 64;
829       CI.UseST64 = true;
830     }
831     return true;
832   }
833 
834   if (isUInt<8>(Max - Min)) {
835     if (Modify) {
836       // From the range of values we could use for BaseOff, choose the one that
837       // is aligned to the highest power of two, to maximise the chance that
838       // the same offset can be reused for other load/store pairs.
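      // For example, with element offsets 1000 and 1100 this picks
      // BaseOff = 896, leaving encoded offsets 104 and 204.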
839       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
840       CI.BaseOff = BaseOff * CI.EltSize;
841       CI.Offset = EltOffset0 - BaseOff;
842       Paired.Offset = EltOffset1 - BaseOff;
843     }
844     return true;
845   }
846 
847   return false;
848 }
849 
850 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
851                                      const CombineInfo &CI,
852                                      const CombineInfo &Paired) {
853   const unsigned Width = (CI.Width + Paired.Width);
854   switch (CI.InstClass) {
855   default:
856     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
857   case S_BUFFER_LOAD_IMM:
858     switch (Width) {
859     default:
860       return false;
861     case 2:
862     case 4:
863     case 8:
864       return true;
865     }
866   }
867 }
868 
869 const TargetRegisterClass *
870 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
871   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
872     return TRI->getRegClassForReg(*MRI, Dst->getReg());
873   }
874   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
875     return TRI->getRegClassForReg(*MRI, Src->getReg());
876   }
877   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
878     return TRI->getRegClassForReg(*MRI, Src->getReg());
879   }
880   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
881     return TRI->getRegClassForReg(*MRI, Dst->getReg());
882   }
883   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
884     return TRI->getRegClassForReg(*MRI, Src->getReg());
885   }
886   return nullptr;
887 }
888 
889 /// This function assumes that CI comes before Paired in a basic block.
890 bool SILoadStoreOptimizer::checkAndPrepareMerge(
891     CombineInfo &CI, CombineInfo &Paired,
892     SmallVectorImpl<MachineInstr *> &InstsToMove) {
893 
894   // Check both offsets (or masks for MIMG) can be combined and fit in the
895   // reduced range.
896   if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
897     return false;
898 
899   if (CI.InstClass != MIMG &&
900       (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
901     return false;
902 
903   const unsigned Opc = CI.I->getOpcode();
904   const InstClassEnum InstClass = getInstClass(Opc, *TII);
905 
906   if (InstClass == UNKNOWN) {
907     return false;
908   }
909   const unsigned InstSubclass = getInstSubclass(Opc, *TII);
910 
911   DenseSet<Register> RegDefsToMove;
912   DenseSet<Register> PhysRegUsesToMove;
913   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
914 
915   const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
916   bool IsAGPR = TRI->hasAGPRs(DataRC);
917 
918   MachineBasicBlock::iterator E = std::next(Paired.I);
919   MachineBasicBlock::iterator MBBI = std::next(CI.I);
920   MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
921   for (; MBBI != E; ++MBBI) {
922 
923     if (MBBI == MBBE) {
924       // CombineInfo::Order is a hint on the instruction ordering within the
925       // basic block. This hint suggests that CI precedes Paired, which is
926       // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order if it moved an instruction
      // that also exists in some other merge list. In that case the
      // instruction must be dependent, so give up on merging this pair.
930       return false;
931     }
932 
933     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
934         (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
937       // 1. It is safe to move I down past MBBI.
938       // 2. It is safe to move MBBI down past the instruction that I will
939       //    be merged into.
940 
941       if (MBBI->hasUnmodeledSideEffects()) {
942         // We can't re-order this instruction with respect to other memory
943         // operations, so we fail both conditions mentioned above.
944         return false;
945       }
946 
947       if (MBBI->mayLoadOrStore() &&
948           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
949            !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
950         // We fail condition #1, but we may still be able to satisfy condition
951         // #2.  Add this instruction to the move list and then we will check
952         // if condition #2 holds once we have selected the matching instruction.
953         InstsToMove.push_back(&*MBBI);
954         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
955         continue;
956       }
957 
      // When we match I with another instruction of the same class, we will
      // be moving I down to the location of the matched instruction, so any
      // uses of I will need to be moved down as well.
961       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
962                             InstsToMove);
963       continue;
964     }
965 
966     // Handle a case like
967     //   DS_WRITE_B32 addr, v, idx0
968     //   w = DS_READ_B32 addr, idx0
969     //   DS_WRITE_B32 addr, f(w), idx1
970     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
971     // merging of the two writes.
972     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
973                               InstsToMove))
974       continue;
975 
976     if (&*MBBI == &*Paired.I) {
977       if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
978         return false;
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
985       if (IsAGPR && CI.InstClass == DS_WRITE)
986         return false;
987 
988       // We need to go through the list of instructions that we plan to
989       // move and make sure they are all safe to move down past the merged
990       // instruction.
991       if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
992 
993         // Call offsetsCanBeCombined with modify = true so that the offsets are
994         // correct for the new instruction.  This should return true, because
995         // this function should only be called on CombineInfo objects that
996         // have already been confirmed to be mergeable.
997         if (CI.InstClass != MIMG)
998           offsetsCanBeCombined(CI, *STM, Paired, true);
999         return true;
1000       }
1001       return false;
1002     }
1003 
    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all of I's users.
1009     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1010         !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1011       break;
1012   }
1013   return false;
1014 }
1015 
1016 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1017   if (STM->ldsRequiresM0Init())
1018     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1019   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1020 }
1021 
1022 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1023   if (STM->ldsRequiresM0Init())
1024     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1025 
1026   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1027                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1028 }
1029 
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1033   MachineBasicBlock *MBB = CI.I->getParent();
1034 
1035   // Be careful, since the addresses could be subregisters themselves in weird
1036   // cases, like vectors of pointers.
1037   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1038 
1039   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1040   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1041 
1042   unsigned NewOffset0 = CI.Offset;
1043   unsigned NewOffset1 = Paired.Offset;
1044   unsigned Opc =
1045       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1046 
1047   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1048   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1049 
1050   if (NewOffset0 > NewOffset1) {
1051     // Canonicalize the merged instruction so the smaller offset comes first.
1052     std::swap(NewOffset0, NewOffset1);
1053     std::swap(SubRegIdx0, SubRegIdx1);
1054   }
1055 
1056   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1057          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1058 
1059   const MCInstrDesc &Read2Desc = TII->get(Opc);
1060 
1061   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1062   Register DestReg = MRI->createVirtualRegister(SuperRC);
1063 
1064   DebugLoc DL = CI.I->getDebugLoc();
1065 
1066   Register BaseReg = AddrReg->getReg();
1067   unsigned BaseSubReg = AddrReg->getSubReg();
1068   unsigned BaseRegFlags = 0;
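  // If offsetsCanBeCombined() chose a nonzero BaseOff, materialize the new
  // base first; roughly (register numbers are illustrative only):
  //   s_mov_b32 s0, <BaseOff>
  //   v_add v3, s0, v_addr        ; built via TII->getAddNoCarry()
  // and then address the ds_read2/ds_read2st64 off the new base.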
1069   if (CI.BaseOff) {
1070     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1071     BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1072         .addImm(CI.BaseOff);
1073 
1074     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1075     BaseRegFlags = RegState::Kill;
1076 
1077     TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1078         .addReg(ImmReg)
1079         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1080         .addImm(0); // clamp bit
1081     BaseSubReg = 0;
1082   }
1083 
1084   MachineInstrBuilder Read2 =
1085       BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1086           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1087           .addImm(NewOffset0)                        // offset0
1088           .addImm(NewOffset1)                        // offset1
1089           .addImm(0)                                 // gds
1090           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1091 
1092   (void)Read2;
1093 
1094   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1095 
1096   // Copy to the old destination registers.
1097   BuildMI(*MBB, Paired.I, DL, CopyDesc)
1098       .add(*Dest0) // Copy to same destination including flags and sub reg.
1099       .addReg(DestReg, 0, SubRegIdx0);
1100   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1101                             .add(*Dest1)
1102                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1103 
1104   moveInstsAfter(Copy1, InstsToMove);
1105 
1106   CI.I->eraseFromParent();
1107   Paired.I->eraseFromParent();
1108 
1109   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1110   return Read2;
1111 }
1112 
1113 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1114   if (STM->ldsRequiresM0Init())
1115     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1116   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1117                         : AMDGPU::DS_WRITE2_B64_gfx9;
1118 }
1119 
1120 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1121   if (STM->ldsRequiresM0Init())
1122     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1123                           : AMDGPU::DS_WRITE2ST64_B64;
1124 
1125   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1126                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1127 }
1128 
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1132   MachineBasicBlock *MBB = CI.I->getParent();
1133 
  // Be sure to pass the whole operand with .add(), and not just the register
  // with .addReg(). We want to be sure we preserve the subregister index and
  // any register flags set on them.
1136   const MachineOperand *AddrReg =
1137       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1138   const MachineOperand *Data0 =
1139       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1140   const MachineOperand *Data1 =
1141       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1142 
1143   unsigned NewOffset0 = CI.Offset;
1144   unsigned NewOffset1 = Paired.Offset;
1145   unsigned Opc =
1146       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1147 
1148   if (NewOffset0 > NewOffset1) {
1149     // Canonicalize the merged instruction so the smaller offset comes first.
1150     std::swap(NewOffset0, NewOffset1);
1151     std::swap(Data0, Data1);
1152   }
1153 
1154   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1155          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1156 
1157   const MCInstrDesc &Write2Desc = TII->get(Opc);
1158   DebugLoc DL = CI.I->getDebugLoc();
1159 
1160   Register BaseReg = AddrReg->getReg();
1161   unsigned BaseSubReg = AddrReg->getSubReg();
1162   unsigned BaseRegFlags = 0;
1163   if (CI.BaseOff) {
1164     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1165     BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1166         .addImm(CI.BaseOff);
1167 
1168     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1169     BaseRegFlags = RegState::Kill;
1170 
1171     TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1172         .addReg(ImmReg)
1173         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1174         .addImm(0); // clamp bit
1175     BaseSubReg = 0;
1176   }
1177 
1178   MachineInstrBuilder Write2 =
1179       BuildMI(*MBB, Paired.I, DL, Write2Desc)
1180           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1181           .add(*Data0)                               // data0
1182           .add(*Data1)                               // data1
1183           .addImm(NewOffset0)                        // offset0
1184           .addImm(NewOffset1)                        // offset1
1185           .addImm(0)                                 // gds
1186           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1187 
1188   moveInstsAfter(Write2, InstsToMove);
1189 
1190   CI.I->eraseFromParent();
1191   Paired.I->eraseFromParent();
1192 
1193   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1194   return Write2;
1195 }
1196 
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1200   MachineBasicBlock *MBB = CI.I->getParent();
1201   DebugLoc DL = CI.I->getDebugLoc();
1202   const unsigned Opcode = getNewOpcode(CI, Paired);
1203 
1204   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1205 
1206   Register DestReg = MRI->createVirtualRegister(SuperRC);
1207   unsigned MergedDMask = CI.DMask | Paired.DMask;
1208   unsigned DMaskIdx =
1209       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1210 
1211   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1212   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1213     if (I == DMaskIdx)
1214       MIB.addImm(MergedDMask);
1215     else
1216       MIB.add((*CI.I).getOperand(I));
1217   }
1218 
1219   // It shouldn't be possible to get this far if the two instructions
1220   // don't have a single memoperand, because MachineInstr::mayAlias()
1221   // will return true if this is the case.
1222   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1223 
1224   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1225   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1226 
  MachineInstr *New = MIB.addMemOperand(
      combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1228 
1229   unsigned SubRegIdx0, SubRegIdx1;
1230   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1231 
1232   // Copy to the old destination registers.
1233   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1234   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1235   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1236 
1237   BuildMI(*MBB, Paired.I, DL, CopyDesc)
1238       .add(*Dest0) // Copy to same destination including flags and sub reg.
1239       .addReg(DestReg, 0, SubRegIdx0);
1240   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1241                             .add(*Dest1)
1242                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1243 
1244   moveInstsAfter(Copy1, InstsToMove);
1245 
1246   CI.I->eraseFromParent();
1247   Paired.I->eraseFromParent();
1248   return New;
1249 }
1250 
1251 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1252     CombineInfo &CI, CombineInfo &Paired,
1253     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1254   MachineBasicBlock *MBB = CI.I->getParent();
1255   DebugLoc DL = CI.I->getDebugLoc();
1256   const unsigned Opcode = getNewOpcode(CI, Paired);
1257 
1258   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1259 
1260   Register DestReg = MRI->createVirtualRegister(SuperRC);
1261   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1262 
1263   // It shouldn't be possible to get this far if the two instructions
1264   // don't have a single memoperand, because MachineInstr::mayAlias()
1265   // will return true if this is the case.
1266   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1267 
1268   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1269   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1270 
1271   MachineInstr *New =
1272     BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1273         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1274         .addImm(MergedOffset) // offset
1275         .addImm(CI.CPol)      // cpol
1276         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1277 
1278   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1279   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1280   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1281 
1282   // Copy to the old destination registers.
1283   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1284   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1285   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1286 
1287   BuildMI(*MBB, Paired.I, DL, CopyDesc)
1288       .add(*Dest0) // Copy to same destination including flags and sub reg.
1289       .addReg(DestReg, 0, SubRegIdx0);
1290   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1291                             .add(*Dest1)
1292                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1293 
1294   moveInstsAfter(Copy1, InstsToMove);
1295 
1296   CI.I->eraseFromParent();
1297   Paired.I->eraseFromParent();
1298   return New;
1299 }
1300 
1301 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1302     CombineInfo &CI, CombineInfo &Paired,
1303     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1304   MachineBasicBlock *MBB = CI.I->getParent();
1305   DebugLoc DL = CI.I->getDebugLoc();
1306 
1307   const unsigned Opcode = getNewOpcode(CI, Paired);
1308 
1309   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1310 
  // Create the new, wider destination register.
1312   Register DestReg = MRI->createVirtualRegister(SuperRC);
1313   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1314 
1315   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1316 
1317   AddressRegs Regs = getRegs(Opcode, *TII);
1318 
1319   if (Regs.VAddr)
1320     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1321 
1322   // It shouldn't be possible to get this far if the two instructions
1323   // don't have a single memoperand, because MachineInstr::mayAlias()
1324   // will return true if this is the case.
1325   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1326 
1327   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1328   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1329 
1330   MachineInstr *New =
1331     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1332         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1333         .addImm(MergedOffset) // offset
1334         .addImm(CI.CPol)      // cpol
1335         .addImm(0)            // tfe
1336         .addImm(0)            // swz
1337         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1338 
1339   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1340   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1341   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1342 
1343   // Copy to the old destination registers.
1344   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1345   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1346   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1347 
1348   BuildMI(*MBB, Paired.I, DL, CopyDesc)
1349       .add(*Dest0) // Copy to same destination including flags and sub reg.
1350       .addReg(DestReg, 0, SubRegIdx0);
1351   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1352                             .add(*Dest1)
1353                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1354 
1355   moveInstsAfter(Copy1, InstsToMove);
1356 
1357   CI.I->eraseFromParent();
1358   Paired.I->eraseFromParent();
1359   return New;
1360 }
1361 
1362 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1363     CombineInfo &CI, CombineInfo &Paired,
1364     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1365   MachineBasicBlock *MBB = CI.I->getParent();
1366   DebugLoc DL = CI.I->getDebugLoc();
1367 
1368   const unsigned Opcode = getNewOpcode(CI, Paired);
1369 
1370   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1371 
  // Create the new, wider destination register.
1373   Register DestReg = MRI->createVirtualRegister(SuperRC);
1374   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1375 
1376   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1377 
1378   AddressRegs Regs = getRegs(Opcode, *TII);
1379 
1380   if (Regs.VAddr)
1381     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1382 
1383   unsigned JoinedFormat =
1384       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1385 
1386   // It shouldn't be possible to get this far if the two instructions
1387   // don't have a single memoperand, because MachineInstr::mayAlias()
1388   // will return true if this is the case.
1389   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1390 
1391   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1392   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1393 
1394   MachineInstr *New =
1395       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1396           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1397           .addImm(MergedOffset) // offset
1398           .addImm(JoinedFormat) // format
1399           .addImm(CI.CPol)      // cpol
1400           .addImm(0)            // tfe
1401           .addImm(0)            // swz
1402           .addMemOperand(
1403               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1404 
1405   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1406   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1407   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1408 
1409   // Copy to the old destination registers.
1410   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1411   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1412   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1413 
1414   BuildMI(*MBB, Paired.I, DL, CopyDesc)
1415       .add(*Dest0) // Copy to same destination including flags and sub reg.
1416       .addReg(DestReg, 0, SubRegIdx0);
1417   MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1418                             .add(*Dest1)
1419                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1420 
1421   moveInstsAfter(Copy1, InstsToMove);
1422 
1423   CI.I->eraseFromParent();
1424   Paired.I->eraseFromParent();
1425   return New;
1426 }
1427 
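// Illustrative sketch only: for the store case the two source registers are
// first glued together with a REG_SEQUENCE, e.g. for the simple dword + dword
// case (placeholder vregs)
//   %merged:vreg_64 = REG_SEQUENCE %data0, %subreg.sub0, %data1, %subreg.sub1
// and %merged then feeds a single wider tbuffer store that uses the joined
// format and the smaller of the two offsets.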
1428 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1429     CombineInfo &CI, CombineInfo &Paired,
1430     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1431   MachineBasicBlock *MBB = CI.I->getParent();
1432   DebugLoc DL = CI.I->getDebugLoc();
1433 
1434   const unsigned Opcode = getNewOpcode(CI, Paired);
1435 
1436   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1437   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1438   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1439 
1440   // Copy to the new source register.
1441   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1442   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1443 
1444   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1445   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1446 
1447   BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1448       .add(*Src0)
1449       .addImm(SubRegIdx0)
1450       .add(*Src1)
1451       .addImm(SubRegIdx1);
1452 
1453   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1454                  .addReg(SrcReg, RegState::Kill);
1455 
1456   AddressRegs Regs = getRegs(Opcode, *TII);
1457 
1458   if (Regs.VAddr)
1459     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1460 
1461   unsigned JoinedFormat =
1462       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1463 
1464   // It shouldn't be possible to get this far if the two instructions
1465   // don't have a single memoperand, because MachineInstr::mayAlias()
1466   // will return true if this is the case.
1467   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1468 
1469   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1470   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1471 
1472   MachineInstr *New =
1473       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1474           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1475           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1476           .addImm(JoinedFormat)                     // format
1477           .addImm(CI.CPol)                          // cpol
1478           .addImm(0)                                // tfe
1479           .addImm(0)                                // swz
1480           .addMemOperand(
1481               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1482 
1483   moveInstsAfter(MIB, InstsToMove);
1484 
1485   CI.I->eraseFromParent();
1486   Paired.I->eraseFromParent();
1487   return New;
1488 }
1489 
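// For example (a representative mapping rather than an exhaustive list):
// merging two S_BUFFER_LOAD_DWORD_IMM instructions (Width = 1 + 1) yields
// S_BUFFER_LOAD_DWORDX2_IMM, while two MUBUF dword loads are widened to the
// DWORDX2 form via getMUBUFOpcode(getMUBUFBaseOpcode(...), 2).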
1490 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1491                                             const CombineInfo &Paired) {
1492   const unsigned Width = CI.Width + Paired.Width;
1493 
1494   switch (CI.InstClass) {
1495   default:
1496     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1497     // FIXME: Handle d16 correctly
1498     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1499                                   Width);
1500   case TBUFFER_LOAD:
1501   case TBUFFER_STORE:
1502     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1503                                   Width);
1504 
1505   case UNKNOWN:
1506     llvm_unreachable("Unknown instruction class");
1507   case S_BUFFER_LOAD_IMM:
1508     switch (Width) {
1509     default:
1510       return 0;
1511     case 2:
1512       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1513     case 4:
1514       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1515     case 8:
1516       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1517     }
1518   case MIMG:
1519     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1520            "No overlaps");
1521     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1522   }
1523 }
1524 
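// Worked example (widths in dwords): if CI covers 1 dword, Paired covers 2
// dwords, and CI has the lower offset (no reversal), the table below gives
// Idx0 = AMDGPU::sub0 for CI and Idx1 = AMDGPU::sub1_sub2 for Paired; with
// the order reversed, Paired gets sub0_sub1 and CI gets sub2.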
1525 std::pair<unsigned, unsigned>
1526 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1527                                     const CombineInfo &Paired) {
1528   bool ReverseOrder;
1529   if (CI.InstClass == MIMG) {
1530     assert(
1531         (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1532         "No overlaps");
1533     ReverseOrder = CI.DMask > Paired.DMask;
1534   } else {
1535     ReverseOrder = CI.Offset > Paired.Offset;
1536   }
1537 
1538   unsigned Idx0;
1539   unsigned Idx1;
1540 
1541   static const unsigned Idxs[5][4] = {
1542       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1543       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1544       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1545       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1546       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1547   };
1548 
1549   assert(CI.Width >= 1 && CI.Width <= 4);
1550   assert(Paired.Width >= 1 && Paired.Width <= 4);
1551 
1552   if (ReverseOrder) {
1553     Idx1 = Idxs[0][Paired.Width - 1];
1554     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1555   } else {
1556     Idx0 = Idxs[0][CI.Width - 1];
1557     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1558   }
1559 
1560   return std::make_pair(Idx0, Idx1);
1561 }
1562 
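// For instance, two merged dword buffer loads (total width 2) land in a
// 64-bit VGPR class (or the AGPR equivalent when the data register class is
// an AGPR class), while an S_BUFFER_LOAD merge with total width 4 uses
// SGPR_128.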
1563 const TargetRegisterClass *
1564 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1565                                              const CombineInfo &Paired) {
1566   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1567     switch (CI.Width + Paired.Width) {
1568     default:
1569       return nullptr;
1570     case 2:
1571       return &AMDGPU::SReg_64_XEXECRegClass;
1572     case 4:
1573       return &AMDGPU::SGPR_128RegClass;
1574     case 8:
1575       return &AMDGPU::SGPR_256RegClass;
1576     case 16:
1577       return &AMDGPU::SGPR_512RegClass;
1578     }
1579   }
1580 
1581   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1582   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1583              ? TRI->getAGPRClassForBitWidth(BitWidth)
1584              : TRI->getVGPRClassForBitWidth(BitWidth);
1585 }
1586 
1587 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1588     CombineInfo &CI, CombineInfo &Paired,
1589     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1590   MachineBasicBlock *MBB = CI.I->getParent();
1591   DebugLoc DL = CI.I->getDebugLoc();
1592 
1593   const unsigned Opcode = getNewOpcode(CI, Paired);
1594 
1595   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1596   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1597   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1598 
1599   // Copy to the new source register.
1600   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1601   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1602 
1603   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1604   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1605 
1606   BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1607       .add(*Src0)
1608       .addImm(SubRegIdx0)
1609       .add(*Src1)
1610       .addImm(SubRegIdx1);
1611 
1612   auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1613                  .addReg(SrcReg, RegState::Kill);
1614 
1615   AddressRegs Regs = getRegs(Opcode, *TII);
1616 
1617   if (Regs.VAddr)
1618     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1619 
1620 
1621   // It shouldn't be possible to get this far if the two instructions
1622   // don't have a single memoperand, because MachineInstr::mayAlias()
1623   // will return true if this is the case.
1624   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1625 
1626   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1627   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1628 
1629   MachineInstr *New =
1630     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1631         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1632         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1633         .addImm(CI.CPol)      // cpol
1634         .addImm(0)            // tfe
1635         .addImm(0)            // swz
1636         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1637 
1638   moveInstsAfter(MIB, InstsToMove);
1639 
1640   CI.I->eraseFromParent();
1641   Paired.I->eraseFromParent();
1642   return New;
1643 }
1644 
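// Small example of the helper below: a value such as 64 is an inline
// constant and is returned as a plain immediate operand, whereas 0x1800 is
// not, so an S_MOV_B32 into a fresh SGPR is emitted and that register is
// returned instead.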
1645 MachineOperand
1646 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1647   APInt V(32, Val, true);
1648   if (TII->isInlineConstant(V))
1649     return MachineOperand::CreateImm(Val);
1650 
1651   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1652   MachineInstr *Mov =
1653   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1654           TII->get(AMDGPU::S_MOV_B32), Reg)
1655     .addImm(Val);
1656   (void)Mov;
1657   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1658   return MachineOperand::CreateReg(Reg, false);
1659 }
1660 
1661 // Compute base address using Addr and return the final register.
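// Roughly, the sequence emitted below looks like (placeholder names; the
// carry register class depends on the wavefront size):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, <offset_lo>, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 %base_hi, <offset_hi>, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1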
1662 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1663                                            const MemAddress &Addr) const {
1664   MachineBasicBlock *MBB = MI.getParent();
1665   MachineBasicBlock::iterator MBBI = MI.getIterator();
1666   DebugLoc DL = MI.getDebugLoc();
1667 
1668   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1669           Addr.Base.LoSubReg) &&
1670          "Expected 32-bit Base-Register-Low!!");
1671 
1672   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1673           Addr.Base.HiSubReg) &&
1674          "Expected 32-bit Base-Register-Hi!!");
1675 
1676   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1677   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1678   MachineOperand OffsetHi =
1679     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1680 
1681   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1682   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1683   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1684 
1685   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1686   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1687   MachineInstr *LoHalf =
1688     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1689       .addReg(CarryReg, RegState::Define)
1690       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1691       .add(OffsetLo)
1692       .addImm(0); // clamp bit
1693   (void)LoHalf;
1694   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1695 
1696   MachineInstr *HiHalf =
1697   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1698     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1699     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1700     .add(OffsetHi)
1701     .addReg(CarryReg, RegState::Kill)
1702     .addImm(0); // clamp bit
1703   (void)HiHalf;
1704   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1705 
1706   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1707   MachineInstr *FullBase =
1708     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1709       .addReg(DestSub0)
1710       .addImm(AMDGPU::sub0)
1711       .addReg(DestSub1)
1712       .addImm(AMDGPU::sub1);
1713   (void)FullBase;
1714   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1715 
1716   return FullDestReg;
1717 }
1718 
1719 // Update base and offset with the NewBase and NewOffset in MI.
1720 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1721                                                Register NewBase,
1722                                                int32_t NewOffset) const {
1723   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1724   Base->setReg(NewBase);
1725   Base->setIsKill(false);
1726   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1727 }
1728 
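// Helper used below: an immediate operand is returned directly, and a
// register operand is looked through only if its unique definition is an
// S_MOV_B32 of an immediate (e.g. %r = S_MOV_B32 8000 yields 8000); anything
// else yields None.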
1729 Optional<int32_t>
1730 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1731   if (Op.isImm())
1732     return Op.getImm();
1733 
1734   if (!Op.isReg())
1735     return None;
1736 
1737   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1738   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1739       !Def->getOperand(1).isImm())
1740     return None;
1741 
1742   return Def->getOperand(1).getImm();
1743 }
1744 
1745 // Analyze Base and extract:
1746 //  - the 32-bit base registers and subregisters
1747 //  - the 64-bit constant offset
1748 // Expecting base computation as:
1749 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1750 //   %LO:vgpr_32, %c:sreg_64_xexec =
1751 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1752 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1753 //   %Base:vreg_64 =
1754 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1755 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1756                                                       MemAddress &Addr) const {
1757   if (!Base.isReg())
1758     return;
1759 
1760   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1761   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1762       || Def->getNumOperands() != 5)
1763     return;
1764 
1765   MachineOperand BaseLo = Def->getOperand(1);
1766   MachineOperand BaseHi = Def->getOperand(3);
1767   if (!BaseLo.isReg() || !BaseHi.isReg())
1768     return;
1769 
1770   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1771   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1772 
1773   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1774       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1775     return;
1776 
1777   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1778   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1779 
1780   auto Offset0P = extractConstOffset(*Src0);
1781   if (Offset0P)
1782     BaseLo = *Src1;
1783   else {
1784     if (!(Offset0P = extractConstOffset(*Src1)))
1785       return;
1786     BaseLo = *Src0;
1787   }
1788 
1789   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1790   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1791 
1792   if (Src0->isImm())
1793     std::swap(Src0, Src1);
1794 
1795   if (!Src1->isImm())
1796     return;
1797 
1798   uint64_t Offset1 = Src1->getImm();
1799   BaseHi = *Src0;
1800 
1801   Addr.Base.LoReg = BaseLo.getReg();
1802   Addr.Base.HiReg = BaseHi.getReg();
1803   Addr.Base.LoSubReg = BaseLo.getSubReg();
1804   Addr.Base.HiSubReg = BaseHi.getSubReg();
1805   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1806 }
1807 
1808 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1809     MachineInstr &MI,
1810     MemInfoMap &Visited,
1811     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1812 
1813   if (!(MI.mayLoad() ^ MI.mayStore()))
1814     return false;
1815 
1816   // TODO: Support flat and scratch.
1817   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1818     return false;
1819 
1820   if (MI.mayLoad() &&
1821       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1822     return false;
1823 
1824   if (AnchorList.count(&MI))
1825     return false;
1826 
1827   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1828 
1829   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1830     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1831     return false;
1832   }
1833 
1834   // Step1: Find the base-registers and a 64bit constant offset.
1835   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1836   MemAddress MAddr;
1837   if (Visited.find(&MI) == Visited.end()) {
1838     processBaseWithConstOffset(Base, MAddr);
1839     Visited[&MI] = MAddr;
1840   } else
1841     MAddr = Visited[&MI];
1842 
1843   if (MAddr.Offset == 0) {
1844     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1845                          " constant offsets that can be promoted.\n";);
1846     return false;
1847   }
1848 
1849   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1850              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1851 
1852   // Step 2: Traverse MI's basic block and find an anchor (an instruction with
1853   // the same base registers) at the largest legal 13-bit distance from MI's offset.
1854   // E.g. (64bit loads)
1855   // bb:
1856   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1857   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1858   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1859   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1860   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1861   //
1862   // Starting from the first load, the optimization will try to find a new base
1863   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1864   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
1865   // as the new base (anchor) because the maximum distance can presumably
1866   // accommodate more intermediate bases.
1867   //
1868   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1869   // (&a + 8192) for load1, load2, load4.
1870   //   addr = &a + 8192
1871   //   load1 = load(addr,       -4096)
1872   //   load2 = load(addr,       -2048)
1873   //   load3 = load(addr,       0)
1874   //   load4 = load(addr,       2048)
1875   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1876   //
1877   MachineInstr *AnchorInst = nullptr;
1878   MemAddress AnchorAddr;
1879   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1880   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1881 
1882   MachineBasicBlock *MBB = MI.getParent();
1883   MachineBasicBlock::iterator E = MBB->end();
1884   MachineBasicBlock::iterator MBBI = MI.getIterator();
1885   ++MBBI;
1886   const SITargetLowering *TLI =
1887     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1888 
1889   for ( ; MBBI != E; ++MBBI) {
1890     MachineInstr &MINext = *MBBI;
1891     // TODO: Support finding an anchor (with the same base) from store addresses
1892     // or any other load addresses where the opcodes are different.
1893     if (MINext.getOpcode() != MI.getOpcode() ||
1894         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1895       continue;
1896 
1897     const MachineOperand &BaseNext =
1898       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1899     MemAddress MAddrNext;
1900     if (Visited.find(&MINext) == Visited.end()) {
1901       processBaseWithConstOffset(BaseNext, MAddrNext);
1902       Visited[&MINext] = MAddrNext;
1903     } else
1904       MAddrNext = Visited[&MINext];
1905 
1906     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1907         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1908         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1909         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1910       continue;
1911 
1912     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1913 
1914     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1915     TargetLoweringBase::AddrMode AM;
1916     AM.HasBaseReg = true;
1917     AM.BaseOffs = Dist;
1918     if (TLI->isLegalGlobalAddressingMode(AM) &&
1919         (uint32_t)std::abs(Dist) > MaxDist) {
1920       MaxDist = std::abs(Dist);
1921 
1922       AnchorAddr = MAddrNext;
1923       AnchorInst = &MINext;
1924     }
1925   }
1926 
1927   if (AnchorInst) {
1928     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1929                AnchorInst->dump());
1930     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1931                <<  AnchorAddr.Offset << "\n\n");
1932 
1933     // Instead of moving up, just re-compute anchor-instruction's base address.
1934     Register Base = computeBase(MI, AnchorAddr);
1935 
1936     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1937     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1938 
1939     for (auto P : InstsWCommonBase) {
1940       TargetLoweringBase::AddrMode AM;
1941       AM.HasBaseReg = true;
1942       AM.BaseOffs = P.second - AnchorAddr.Offset;
1943 
1944       if (TLI->isLegalGlobalAddressingMode(AM)) {
1945         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1946                    dbgs() << ")"; P.first->dump());
1947         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1948         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1949       }
1950     }
1951     AnchorList.insert(AnchorInst);
1952     return true;
1953   }
1954 
1955   return false;
1956 }
1957 
1958 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1959                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1960   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1961     if (AddrList.front().InstClass == CI.InstClass &&
1962         AddrList.front().hasSameBaseAddress(*CI.I)) {
1963       AddrList.emplace_back(CI);
1964       return;
1965     }
1966   }
1967 
1968   // Base address not found, so add a new list.
1969   MergeableInsts.emplace_back(1, CI);
1970 }
1971 
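// As a sketch of the grouping: ds_read_b32 instructions using the same base
// register with offsets 0, 8 and 16 all land in one list, while a read from
// an unrelated base register (or of a different instruction class) starts a
// new list; only instructions within a single list are considered for pairing.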
1972 std::pair<MachineBasicBlock::iterator, bool>
1973 SILoadStoreOptimizer::collectMergeableInsts(
1974     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1975     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1976     std::list<std::list<CombineInfo>> &MergeableInsts) const {
1977   bool Modified = false;
1978 
1979   // Sort potentially mergeable instructions into lists, one list per base address.
1980   unsigned Order = 0;
1981   MachineBasicBlock::iterator BlockI = Begin;
1982   for (; BlockI != End; ++BlockI) {
1983     MachineInstr &MI = *BlockI;
1984 
1985     // We run this before checking if an address is mergeable, because it can produce
1986     // better code even if the instructions aren't mergeable.
1987     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1988       Modified = true;
1989 
1990     // Don't combine if volatile. We also won't be able to merge across this, so
1991     // break the search. The search can resume after this barrier for separate merges.
1992     if (MI.hasOrderedMemoryRef()) {
1993       LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
1994 
1995       // Search will resume after this instruction in a separate merge list.
1996       ++BlockI;
1997       break;
1998     }
1999 
2000     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2001     if (InstClass == UNKNOWN)
2002       continue;
2003 
2004     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2005     int Swizzled =
2006         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2007     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2008       continue;
2009 
2010     CombineInfo CI;
2011     CI.setMI(MI, *this);
2012     CI.Order = Order++;
2013 
2014     if (!CI.hasMergeableAddress(*MRI))
2015       continue;
2016 
2017     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2018 
2019     addInstToMergeableList(CI, MergeableInsts);
2020   }
2021 
2022   // At this point we have lists of mergeable instructions.
2023   //
2024   // Part 2: Discard any list that contains only a single instruction, since a
2025   // merge needs at least two candidates, and sort the remaining lists by offset
2026   // so that instructions which can be merged end up adjacent to each other.
2027   // The actual pairing happens later, in optimizeInstsWithSameBaseAddr().
2028 
2029   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2030                                                    E = MergeableInsts.end(); I != E;) {
2031 
2032     std::list<CombineInfo> &MergeList = *I;
2033     if (MergeList.size() <= 1) {
2034       // This means we have found only one instruction with a given address
2035       // that can be merged, and we need at least 2 instructions to do a merge,
2036       // so this list can be discarded.
2037       I = MergeableInsts.erase(I);
2038       continue;
2039     }
2040 
2041     // Sort the lists by offsets, this way mergeable instructions will be
2042     // adjacent to each other in the list, which will make it easier to find
2043     // matches.
2044     MergeList.sort(
2045         [] (const CombineInfo &A, const CombineInfo &B) {
2046           return A.Offset < B.Offset;
2047         });
2048     ++I;
2049   }
2050 
2051   return std::make_pair(BlockI, Modified);
2052 }
2053 
2054 // Scan through looking for adjacent memory operations (DS, SMEM, buffer, and
2055 // image accesses) with constant offsets from the same base register. We rely on
2056 // the scheduler to cluster nearby loads and assume these are all adjacent.
2057 bool SILoadStoreOptimizer::optimizeBlock(
2058                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2059   bool Modified = false;
2060 
2061   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2062                                                    E = MergeableInsts.end(); I != E;) {
2063     std::list<CombineInfo> &MergeList = *I;
2064 
2065     bool OptimizeListAgain = false;
2066     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2067       // We weren't able to make any changes, so delete the list so we don't
2068       // process the same instructions the next time we try to optimize this
2069       // block.
2070       I = MergeableInsts.erase(I);
2071       continue;
2072     }
2073 
2074     Modified = true;
2075 
2076     // We made changes, but also determined that there were no more optimization
2077     // opportunities, so we don't need to reprocess the list.
2078     if (!OptimizeListAgain) {
2079       I = MergeableInsts.erase(I);
2080       continue;
2081     }
2082     OptimizeAgain = true;
2083   }
2084   return Modified;
2085 }
2086 
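// Example of the re-merge loop driven by OptimizeListAgain: four adjacent
// s_buffer_load_dword entries merge pairwise into two DWORDX2 loads on the
// first pass; because 2 + 2 is still below the x8 limit the list is kept,
// and a later pass can combine the two X2 results into a single DWORDX4
// (and eventually an X8 if enough neighbours exist).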
2087 bool
2088 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2089                                           std::list<CombineInfo> &MergeList,
2090                                           bool &OptimizeListAgain) {
2091   if (MergeList.empty())
2092     return false;
2093 
2094   bool Modified = false;
2095 
2096   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2097        Next = std::next(I)) {
2098 
2099     auto First = I;
2100     auto Second = Next;
2101 
2102     if ((*First).Order > (*Second).Order)
2103       std::swap(First, Second);
2104     CombineInfo &CI = *First;
2105     CombineInfo &Paired = *Second;
2106 
2107     SmallVector<MachineInstr *, 8> InstsToMove;
2108     if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2109       ++I;
2110       continue;
2111     }
2112 
2113     Modified = true;
2114 
2115     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2116 
2117     switch (CI.InstClass) {
2118     default:
2119       llvm_unreachable("unknown InstClass");
2120       break;
2121     case DS_READ: {
2122       MachineBasicBlock::iterator NewMI =
2123           mergeRead2Pair(CI, Paired, InstsToMove);
2124       CI.setMI(NewMI, *this);
2125       break;
2126     }
2127     case DS_WRITE: {
2128       MachineBasicBlock::iterator NewMI =
2129           mergeWrite2Pair(CI, Paired, InstsToMove);
2130       CI.setMI(NewMI, *this);
2131       break;
2132     }
2133     case S_BUFFER_LOAD_IMM: {
2134       MachineBasicBlock::iterator NewMI =
2135           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2136       CI.setMI(NewMI, *this);
2137       OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
2138       break;
2139     }
2140     case BUFFER_LOAD: {
2141       MachineBasicBlock::iterator NewMI =
2142           mergeBufferLoadPair(CI, Paired, InstsToMove);
2143       CI.setMI(NewMI, *this);
2144       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2145       break;
2146     }
2147     case BUFFER_STORE: {
2148       MachineBasicBlock::iterator NewMI =
2149           mergeBufferStorePair(CI, Paired, InstsToMove);
2150       CI.setMI(NewMI, *this);
2151       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2152       break;
2153     }
2154     case MIMG: {
2155       MachineBasicBlock::iterator NewMI =
2156           mergeImagePair(CI, Paired, InstsToMove);
2157       CI.setMI(NewMI, *this);
2158       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2159       break;
2160     }
2161     case TBUFFER_LOAD: {
2162       MachineBasicBlock::iterator NewMI =
2163           mergeTBufferLoadPair(CI, Paired, InstsToMove);
2164       CI.setMI(NewMI, *this);
2165       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2166       break;
2167     }
2168     case TBUFFER_STORE: {
2169       MachineBasicBlock::iterator NewMI =
2170           mergeTBufferStorePair(CI, Paired, InstsToMove);
2171       CI.setMI(NewMI, *this);
2172       OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2173       break;
2174     }
2175     }
2176     CI.Order = Paired.Order;
2177     if (I == Second)
2178       I = Next;
2179 
2180     MergeList.erase(Second);
2181   }
2182 
2183   return Modified;
2184 }
2185 
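// Per-function driver: each basic block is processed in sections.
// collectMergeableInsts() gathers candidates up to the first instruction with
// an ordered memory reference (e.g. a volatile access or a fence),
// optimizeBlock() is re-run on that section until no further merges are
// found, and the scan then resumes after the barrier.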
2186 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2187   if (skipFunction(MF.getFunction()))
2188     return false;
2189 
2190   STM = &MF.getSubtarget<GCNSubtarget>();
2191   if (!STM->loadStoreOptEnabled())
2192     return false;
2193 
2194   TII = STM->getInstrInfo();
2195   TRI = &TII->getRegisterInfo();
2196 
2197   MRI = &MF.getRegInfo();
2198   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2199 
2200   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2201 
2202   bool Modified = false;
2203 
2204   // Contains the list of instructions for which constant offsets are being
2205   // promoted to the immediate. This is tracked for an entire block at a time.
2206   SmallPtrSet<MachineInstr *, 4> AnchorList;
2207   MemInfoMap Visited;
2208 
2209   for (MachineBasicBlock &MBB : MF) {
2210     MachineBasicBlock::iterator SectionEnd;
2211     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2212          I = SectionEnd) {
2213       bool CollectModified;
2214       std::list<std::list<CombineInfo>> MergeableInsts;
2215 
2216       // First pass: Collect list of all instructions we know how to merge in a
2217       // subset of the block.
2218       std::tie(SectionEnd, CollectModified) =
2219           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2220 
2221       Modified |= CollectModified;
2222 
2223       do {
2224         OptimizeAgain = false;
2225         Modified |= optimizeBlock(MergeableInsts);
2226       } while (OptimizeAgain);
2227     }
2228 
2229     Visited.clear();
2230     AnchorList.clear();
2231   }
2232 
2233   return Modified;
2234 }
2235