1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
//  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
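//  (The read2 offsets are in units of the element size, here 4 bytes, so byte
//   offsets 16 and 32 become offset0:4 and offset1:8.)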
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
// This pass also tries to promote a constant offset to an immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows the access to have a 13-bit constant offset, and then promotes that
// 13-bit offset into the instruction's immediate field.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but the offsets are close enough together, we can add to
//   the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82 };
83 
84 struct AddressRegs {
85   unsigned char NumVAddrs = 0;
86   bool SBase = false;
87   bool SRsrc = false;
88   bool SOffset = false;
89   bool VAddr = false;
90   bool Addr = false;
91   bool SSamp = false;
92 };
93 
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96 
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98   struct CombineInfo {
99     MachineBasicBlock::iterator I;
100     unsigned EltSize;
101     unsigned Offset;
102     unsigned Width;
103     unsigned Format;
104     unsigned BaseOff;
105     unsigned DMask;
106     InstClassEnum InstClass;
107     unsigned CPol = 0;
108     bool IsAGPR;
109     bool UseST64;
110     int AddrIdx[MaxAddressRegs];
111     const MachineOperand *AddrReg[MaxAddressRegs];
112     unsigned NumAddresses;
113     unsigned Order;
114 
115     bool hasSameBaseAddress(const MachineInstr &MI) {
116       for (unsigned i = 0; i < NumAddresses; i++) {
117         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
118 
119         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
120           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
121               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
122             return false;
123           }
124           continue;
125         }
126 
127         // Check same base pointer. Be careful of subregisters, which can occur
128         // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
133       }
134       return true;
135     }
136 
137     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
138       for (unsigned i = 0; i < NumAddresses; ++i) {
139         const MachineOperand *AddrOp = AddrReg[i];
140         // Immediates are always OK.
141         if (AddrOp->isImm())
142           continue;
143 
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
147         if (!AddrOp->isReg())
148           return false;
149 
150         // TODO: We should be able to merge physical reg addresses.
151         if (AddrOp->getReg().isPhysical())
152           return false;
153 
        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
156         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
157           return false;
158       }
159       return true;
160     }
161 
162     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
163   };
164 
165   struct BaseRegisters {
166     Register LoReg;
167     Register HiReg;
168 
169     unsigned LoSubReg = 0;
170     unsigned HiSubReg = 0;
171   };
172 
173   struct MemAddress {
174     BaseRegisters Base;
175     int64_t Offset = 0;
176   };
177 
178   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179 
180 private:
181   const GCNSubtarget *STM = nullptr;
182   const SIInstrInfo *TII = nullptr;
183   const SIRegisterInfo *TRI = nullptr;
184   MachineRegisterInfo *MRI = nullptr;
185   AliasAnalysis *AA = nullptr;
186   bool OptimizeAgain;
187 
188   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
189                            const DenseSet<Register> &ARegUses,
190                            const MachineInstr &A, const MachineInstr &B) const;
191   static bool dmasksCanBeCombined(const CombineInfo &CI,
192                                   const SIInstrInfo &TII,
193                                   const CombineInfo &Paired);
194   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
195                                    CombineInfo &Paired, bool Modify = false);
196   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
197                         const CombineInfo &Paired);
198   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
199   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
200                                                      const CombineInfo &Paired);
201   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
202                                                     const CombineInfo &Paired);
203   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
204 
205   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
206 
207   unsigned read2Opcode(unsigned EltSize) const;
208   unsigned read2ST64Opcode(unsigned EltSize) const;
209   MachineBasicBlock::iterator
210   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
211                  MachineBasicBlock::iterator InsertBefore);
212 
213   unsigned write2Opcode(unsigned EltSize) const;
214   unsigned write2ST64Opcode(unsigned EltSize) const;
215   MachineBasicBlock::iterator
216   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
217                   MachineBasicBlock::iterator InsertBefore);
218   MachineBasicBlock::iterator
219   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
220                  MachineBasicBlock::iterator InsertBefore);
221   MachineBasicBlock::iterator
222   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
223                           MachineBasicBlock::iterator InsertBefore);
224   MachineBasicBlock::iterator
225   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
226                       MachineBasicBlock::iterator InsertBefore);
227   MachineBasicBlock::iterator
228   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
229                        MachineBasicBlock::iterator InsertBefore);
230   MachineBasicBlock::iterator
231   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
232                        MachineBasicBlock::iterator InsertBefore);
233   MachineBasicBlock::iterator
234   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
235                         MachineBasicBlock::iterator InsertBefore);
236 
237   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
238                            int32_t NewOffset) const;
239   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
240   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
241   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
242   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
246   bool promoteConstantOffsetToImm(MachineInstr &CI,
247                                   MemInfoMap &Visited,
248                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;
251 
252   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
253       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
254       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
255       std::list<std::list<CombineInfo>> &MergeableInsts) const;
256 
257 public:
258   static char ID;
259 
260   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
261     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
262   }
263 
264   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
265                                      bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
267 
268   bool runOnMachineFunction(MachineFunction &MF) override;
269 
270   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
271 
272   void getAnalysisUsage(AnalysisUsage &AU) const override {
273     AU.setPreservesCFG();
274     AU.addRequired<AAResultsWrapperPass>();
275 
276     MachineFunctionPass::getAnalysisUsage(AU);
277   }
278 
279   MachineFunctionProperties getRequiredProperties() const override {
280     return MachineFunctionProperties()
281       .set(MachineFunctionProperties::Property::IsSSA);
282   }
283 };
284 
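/// Returns the width of \p MI in the units used for merging: the element
/// count for MUBUF/MTBUF opcodes, the number of set dmask bits for MIMG, and
/// the dword count for DS and S_BUFFER_LOAD opcodes; 0 if the opcode is not
/// handled.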
285 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
286   const unsigned Opc = MI.getOpcode();
287 
288   if (TII.isMUBUF(Opc)) {
289     // FIXME: Handle d16 correctly
290     return AMDGPU::getMUBUFElements(Opc);
291   }
292   if (TII.isMIMG(MI)) {
293     uint64_t DMaskImm =
294         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
295     return countPopulation(DMaskImm);
296   }
297   if (TII.isMTBUF(Opc)) {
298     return AMDGPU::getMTBUFElements(Opc);
299   }
300 
301   switch (Opc) {
302   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
303     return 1;
304   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
305     return 2;
306   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
307     return 4;
308   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
309     return 8;
310   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
311   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
312   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
313   case AMDGPU::DS_WRITE_B32_gfx9:
314     return 1;
315   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
316   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
317   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
318   case AMDGPU::DS_WRITE_B64_gfx9:
319     return 2;
320   default:
321     return 0;
322   }
323 }
324 
325 /// Maps instruction opcode to enum InstClassEnum.
326 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
327   switch (Opc) {
328   default:
329     if (TII.isMUBUF(Opc)) {
330       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
331       default:
332         return UNKNOWN;
333       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
334       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
335       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
336       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
337         return BUFFER_LOAD;
338       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
339       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
340       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
341       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
342         return BUFFER_STORE;
343       }
344     }
345     if (TII.isMIMG(Opc)) {
346       // Ignore instructions encoded without vaddr.
347       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
348           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
349         return UNKNOWN;
350       // Ignore BVH instructions
351       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
352         return UNKNOWN;
353       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
354       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
355           TII.isGather4(Opc))
356         return UNKNOWN;
357       return MIMG;
358     }
359     if (TII.isMTBUF(Opc)) {
360       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
361       default:
362         return UNKNOWN;
363       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
364       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
365       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
366       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
367         return TBUFFER_LOAD;
368       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
369       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
370       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
371       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
372         return TBUFFER_STORE;
373       }
374     }
375     return UNKNOWN;
376   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
377   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
378   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
379   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
380     return S_BUFFER_LOAD_IMM;
381   case AMDGPU::DS_READ_B32:
382   case AMDGPU::DS_READ_B32_gfx9:
383   case AMDGPU::DS_READ_B64:
384   case AMDGPU::DS_READ_B64_gfx9:
385     return DS_READ;
386   case AMDGPU::DS_WRITE_B32:
387   case AMDGPU::DS_WRITE_B32_gfx9:
388   case AMDGPU::DS_WRITE_B64:
389   case AMDGPU::DS_WRITE_B64_gfx9:
390     return DS_WRITE;
391   }
392 }
393 
394 /// Determines instruction subclass from opcode. Only instructions
395 /// of the same subclass can be merged together. The merged instruction may have
396 /// a different subclass but must have the same class.
397 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
398   switch (Opc) {
399   default:
400     if (TII.isMUBUF(Opc))
401       return AMDGPU::getMUBUFBaseOpcode(Opc);
402     if (TII.isMIMG(Opc)) {
403       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
404       assert(Info);
405       return Info->BaseOpcode;
406     }
407     if (TII.isMTBUF(Opc))
408       return AMDGPU::getMTBUFBaseOpcode(Opc);
409     return -1;
410   case AMDGPU::DS_READ_B32:
411   case AMDGPU::DS_READ_B32_gfx9:
412   case AMDGPU::DS_READ_B64:
413   case AMDGPU::DS_READ_B64_gfx9:
414   case AMDGPU::DS_WRITE_B32:
415   case AMDGPU::DS_WRITE_B32_gfx9:
416   case AMDGPU::DS_WRITE_B64:
417   case AMDGPU::DS_WRITE_B64_gfx9:
418     return Opc;
419   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
420   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
421   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
422   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
423     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
424   }
425 }
426 
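/// Returns which of the named address operands (vaddr, srsrc, soffset, sbase,
/// addr, ssamp) are present for \p Opc, plus the number of vaddr registers
/// for MIMG opcodes that use a vaddr0 operand list.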
427 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
428   AddressRegs Result;
429 
430   if (TII.isMUBUF(Opc)) {
431     if (AMDGPU::getMUBUFHasVAddr(Opc))
432       Result.VAddr = true;
433     if (AMDGPU::getMUBUFHasSrsrc(Opc))
434       Result.SRsrc = true;
435     if (AMDGPU::getMUBUFHasSoffset(Opc))
436       Result.SOffset = true;
437 
438     return Result;
439   }
440 
441   if (TII.isMIMG(Opc)) {
442     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
443     if (VAddr0Idx >= 0) {
444       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
445       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
446     } else {
447       Result.VAddr = true;
448     }
449     Result.SRsrc = true;
450     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
451     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
452       Result.SSamp = true;
453 
454     return Result;
455   }
456   if (TII.isMTBUF(Opc)) {
457     if (AMDGPU::getMTBUFHasVAddr(Opc))
458       Result.VAddr = true;
459     if (AMDGPU::getMTBUFHasSrsrc(Opc))
460       Result.SRsrc = true;
461     if (AMDGPU::getMTBUFHasSoffset(Opc))
462       Result.SOffset = true;
463 
464     return Result;
465   }
466 
467   switch (Opc) {
468   default:
469     return Result;
470   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
471   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
472   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
474     Result.SBase = true;
475     return Result;
476   case AMDGPU::DS_READ_B32:
477   case AMDGPU::DS_READ_B64:
478   case AMDGPU::DS_READ_B32_gfx9:
479   case AMDGPU::DS_READ_B64_gfx9:
480   case AMDGPU::DS_WRITE_B32:
481   case AMDGPU::DS_WRITE_B64:
482   case AMDGPU::DS_WRITE_B32_gfx9:
483   case AMDGPU::DS_WRITE_B64_gfx9:
484     Result.Addr = true;
485     return Result;
486   }
487 }
488 
489 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
490                                               const SILoadStoreOptimizer &LSO) {
491   I = MI;
492   unsigned Opc = MI->getOpcode();
493   InstClass = getInstClass(Opc, *LSO.TII);
494 
495   if (InstClass == UNKNOWN)
496     return;
497 
498   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
499 
500   switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
511   case S_BUFFER_LOAD_IMM:
512     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
513     break;
514   default:
515     EltSize = 4;
516     break;
517   }
518 
519   if (InstClass == MIMG) {
520     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
521     // Offset is not considered for MIMG instructions.
522     Offset = 0;
523   } else {
524     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
525     Offset = I->getOperand(OffsetIdx).getImm();
526   }
527 
528   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
529     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
530 
531   Width = getOpcodeWidth(*I, *LSO.TII);
532 
533   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
534     Offset &= 0xffff;
535   } else if (InstClass != MIMG) {
536     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
537   }
538 
539   AddressRegs Regs = getRegs(Opc, *LSO.TII);
540 
541   NumAddresses = 0;
542   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
543     AddrIdx[NumAddresses++] =
544         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
545   if (Regs.Addr)
546     AddrIdx[NumAddresses++] =
547         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
548   if (Regs.SBase)
549     AddrIdx[NumAddresses++] =
550         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
551   if (Regs.SRsrc)
552     AddrIdx[NumAddresses++] =
553         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
554   if (Regs.SOffset)
555     AddrIdx[NumAddresses++] =
556         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
557   if (Regs.VAddr)
558     AddrIdx[NumAddresses++] =
559         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
560   if (Regs.SSamp)
561     AddrIdx[NumAddresses++] =
562         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
563   assert(NumAddresses <= MaxAddressRegs);
564 
565   for (unsigned J = 0; J < NumAddresses; J++)
566     AddrReg[J] = &I->getOperand(AddrIdx[J]);
567 }
568 
569 } // end anonymous namespace.
570 
571 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
572                       "SI Load Store Optimizer", false, false)
573 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
574 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
575                     false, false)
576 
577 char SILoadStoreOptimizer::ID = 0;
578 
579 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
580 
581 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
582   return new SILoadStoreOptimizer();
583 }
584 
585 static void addDefsUsesToList(const MachineInstr &MI,
586                               DenseSet<Register> &RegDefs,
587                               DenseSet<Register> &RegUses) {
588   for (const auto &Op : MI.operands()) {
589     if (!Op.isReg())
590       continue;
591     if (Op.isDef())
592       RegDefs.insert(Op.getReg());
593     if (Op.readsReg())
594       RegUses.insert(Op.getReg());
595   }
596 }
597 
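// Check whether it is safe to move instruction \p A across instruction \p B:
// the two must not access aliasing memory unless neither of them stores, and
// \p B must not define or read a register that \p A defines, nor define a
// register that \p A reads. \p ARegDefs and \p ARegUses must contain A's
// register defs and uses, as collected by addDefsUsesToList().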
598 bool SILoadStoreOptimizer::canSwapInstructions(
599     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
600     const MachineInstr &A, const MachineInstr &B) const {
601   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
602       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
603     return false;
604   for (const auto &BOp : B.operands()) {
605     if (!BOp.isReg())
606       continue;
607     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
608       return false;
609     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
610       return false;
611   }
612   return true;
613 }
614 
// This function assumes that \p A and \p B are identical except for size and
// offset, and that they reference adjacent memory.
617 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
618                                                    const MachineMemOperand *A,
619                                                    const MachineMemOperand *B) {
620   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
621   unsigned Size = A->getSize() + B->getSize();
622   // This function adds the offset parameter to the existing offset for A,
623   // so we pass 0 here as the offset and then manually set it to the correct
624   // value after the call.
625   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
626   MMO->setOffset(MinOffset);
627   return MMO;
628 }
629 
630 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
631                                                const SIInstrInfo &TII,
632                                                const CombineInfo &Paired) {
633   assert(CI.InstClass == MIMG);
634 
635   // Ignore instructions with tfe/lwe set.
636   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
637   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
638 
639   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
640     return false;
641 
642   // Check other optional immediate operands for equality.
643   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
644                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
645                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
646 
647   for (auto op : OperandsToMatch) {
648     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
649     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
650       return false;
651     if (Idx != -1 &&
652         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
653       return false;
654   }
655 
656   // Check DMask for overlaps.
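  // The check below requires every set bit of the smaller mask to lie below
  // the lowest set bit of the larger one, i.e. the two dmasks must neither
  // overlap nor interleave. E.g. dmasks 0b0011 and 0b1100 are combinable
  // (MaxMask has two trailing zeros and MinMask < 1 << 2), whereas 0b0101 and
  // 0b1010 are rejected even though they do not overlap.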
657   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
658   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
659 
660   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
661   if ((1u << AllowedBitsForMin) <= MinMask)
662     return false;
663 
664   return true;
665 }
666 
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
670   if (ComponentCount > 4)
671     return 0;
672 
673   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
674       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
675   if (!OldFormatInfo)
676     return 0;
677 
678   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
679       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
680                                            ComponentCount,
681                                            OldFormatInfo->NumFormat, STI);
682 
683   if (!NewFormatInfo)
684     return 0;
685 
686   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
687          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
688 
689   return NewFormatInfo->Format;
690 }
691 
692 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
693 // highest power of two. Note that the result is well defined for all inputs
694 // including corner cases like:
695 // - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
697 // - if Lo > Hi, return 0 (as if the range wrapped around)
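// E.g. mostAlignedValueInRange(9, 14) == 12: within 9..14, 12 is the only
// value divisible by 4, and nothing in that range is divisible by 8 or 16.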
698 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
699   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
700 }
701 
702 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
703                                                 const GCNSubtarget &STI,
704                                                 CombineInfo &Paired,
705                                                 bool Modify) {
706   assert(CI.InstClass != MIMG);
707 
708   // XXX - Would the same offset be OK? Is there any reason this would happen or
709   // be useful?
710   if (CI.Offset == Paired.Offset)
711     return false;
712 
713   // This won't be valid if the offset isn't aligned.
714   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
715     return false;
716 
717   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
718 
719     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
720         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
721     if (!Info0)
722       return false;
723     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
724         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
725     if (!Info1)
726       return false;
727 
728     if (Info0->BitsPerComp != Info1->BitsPerComp ||
729         Info0->NumFormat != Info1->NumFormat)
730       return false;
731 
732     // TODO: Should be possible to support more formats, but if format loads
733     // are not dword-aligned, the merged load might not be valid.
734     if (Info0->BitsPerComp != 32)
735       return false;
736 
737     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
738       return false;
739   }
740 
741   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
742   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
743   CI.UseST64 = false;
744   CI.BaseOff = 0;
745 
746   // Handle all non-DS instructions.
747   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
752   }
753 
754   // If the offset in elements doesn't fit in 8-bits, we might be able to use
755   // the stride 64 versions.
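  // E.g. for ds_read_b32 at byte offsets 8192 and 8448, the element offsets
  // are 2048 and 2112; both are multiples of 64, so the pair can become a
  // ds_read2st64_b32 with offset0:32 and offset1:33.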
756   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
757       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
758     if (Modify) {
759       CI.Offset = EltOffset0 / 64;
760       Paired.Offset = EltOffset1 / 64;
761       CI.UseST64 = true;
762     }
763     return true;
764   }
765 
766   // Check if the new offsets fit in the reduced 8-bit range.
767   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
768     if (Modify) {
769       CI.Offset = EltOffset0;
770       Paired.Offset = EltOffset1;
771     }
772     return true;
773   }
774 
775   // Try to shift base address to decrease offsets.
776   uint32_t Min = std::min(EltOffset0, EltOffset1);
777   uint32_t Max = std::max(EltOffset0, EltOffset1);
778 
779   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
780   if (((Max - Min) & ~Mask) == 0) {
781     if (Modify) {
782       // From the range of values we could use for BaseOff, choose the one that
783       // is aligned to the highest power of two, to maximise the chance that
784       // the same offset can be reused for other load/store pairs.
785       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
786       // Copy the low bits of the offsets, so that when we adjust them by
787       // subtracting BaseOff they will be multiples of 64.
788       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
789       CI.BaseOff = BaseOff * CI.EltSize;
790       CI.Offset = (EltOffset0 - BaseOff) / 64;
791       Paired.Offset = (EltOffset1 - BaseOff) / 64;
792       CI.UseST64 = true;
793     }
794     return true;
795   }
796 
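  // Otherwise, if the two offsets are close enough together, rebase them onto
  // an adjusted base address. E.g. for element offsets 300 and 310, BaseOff
  // becomes mostAlignedValueInRange(55, 300) == 256 and the merged instruction
  // is emitted with offsets 44 and 54.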
797   if (isUInt<8>(Max - Min)) {
798     if (Modify) {
799       // From the range of values we could use for BaseOff, choose the one that
800       // is aligned to the highest power of two, to maximise the chance that
801       // the same offset can be reused for other load/store pairs.
802       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
803       CI.BaseOff = BaseOff * CI.EltSize;
804       CI.Offset = EltOffset0 - BaseOff;
805       Paired.Offset = EltOffset1 - BaseOff;
806     }
807     return true;
808   }
809 
810   return false;
811 }
812 
813 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
814                                      const CombineInfo &CI,
815                                      const CombineInfo &Paired) {
816   const unsigned Width = (CI.Width + Paired.Width);
817   switch (CI.InstClass) {
818   default:
819     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
820   case S_BUFFER_LOAD_IMM:
821     switch (Width) {
822     default:
823       return false;
824     case 2:
825     case 4:
826     case 8:
827       return true;
828     }
829   }
830 }
831 
832 const TargetRegisterClass *
833 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
834   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
835     return TRI->getRegClassForReg(*MRI, Dst->getReg());
836   }
837   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
838     return TRI->getRegClassForReg(*MRI, Src->getReg());
839   }
840   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
841     return TRI->getRegClassForReg(*MRI, Src->getReg());
842   }
843   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
844     return TRI->getRegClassForReg(*MRI, Dst->getReg());
845   }
846   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
847     return TRI->getRegClassForReg(*MRI, Src->getReg());
848   }
849   return nullptr;
850 }
851 
852 /// This function assumes that CI comes before Paired in a basic block. Return
853 /// an insertion point for the merged instruction or nullptr on failure.
854 SILoadStoreOptimizer::CombineInfo *
855 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
856                                            CombineInfo &Paired) {
857   // If another instruction has already been merged into CI, it may now be a
858   // type that we can't do any further merging into.
859   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
860     return nullptr;
861   assert(CI.InstClass == Paired.InstClass);
862 
863   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
864       getInstSubclass(Paired.I->getOpcode(), *TII))
865     return nullptr;
866 
867   // Check both offsets (or masks for MIMG) can be combined and fit in the
868   // reduced range.
869   if (CI.InstClass == MIMG) {
870     if (!dmasksCanBeCombined(CI, *TII, Paired))
871       return nullptr;
872   } else {
873     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
874       return nullptr;
875   }
876 
877   DenseSet<Register> RegDefs;
878   DenseSet<Register> RegUses;
879   CombineInfo *Where;
880   if (CI.I->mayLoad()) {
881     // Try to hoist Paired up to CI.
882     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
883     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
884       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
885         return nullptr;
886     }
887     Where = &CI;
888   } else {
889     // Try to sink CI down to Paired.
890     addDefsUsesToList(*CI.I, RegDefs, RegUses);
891     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
892       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
893         return nullptr;
894     }
895     Where = &Paired;
896   }
897 
898   // Call offsetsCanBeCombined with modify = true so that the offsets are
899   // correct for the new instruction.  This should return true, because
900   // this function should only be called on CombineInfo objects that
901   // have already been confirmed to be mergeable.
902   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
903     offsetsCanBeCombined(CI, *STM, Paired, true);
904   return Where;
905 }
906 
907 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
908   if (STM->ldsRequiresM0Init())
909     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
910   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
911 }
912 
913 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
914   if (STM->ldsRequiresM0Init())
915     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
916 
917   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
918                         : AMDGPU::DS_READ2ST64_B64_gfx9;
919 }
920 
921 MachineBasicBlock::iterator
922 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
923                                      MachineBasicBlock::iterator InsertBefore) {
924   MachineBasicBlock *MBB = CI.I->getParent();
925 
926   // Be careful, since the addresses could be subregisters themselves in weird
927   // cases, like vectors of pointers.
928   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
929 
930   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
931   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
932 
933   unsigned NewOffset0 = CI.Offset;
934   unsigned NewOffset1 = Paired.Offset;
935   unsigned Opc =
936       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
937 
938   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
939   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
940 
941   if (NewOffset0 > NewOffset1) {
942     // Canonicalize the merged instruction so the smaller offset comes first.
943     std::swap(NewOffset0, NewOffset1);
944     std::swap(SubRegIdx0, SubRegIdx1);
945   }
946 
947   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
948          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
949 
950   const MCInstrDesc &Read2Desc = TII->get(Opc);
951 
952   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
953   Register DestReg = MRI->createVirtualRegister(SuperRC);
954 
955   DebugLoc DL = CI.I->getDebugLoc();
956 
957   Register BaseReg = AddrReg->getReg();
958   unsigned BaseSubReg = AddrReg->getSubReg();
959   unsigned BaseRegFlags = 0;
960   if (CI.BaseOff) {
961     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
962     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
963         .addImm(CI.BaseOff);
964 
965     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
966     BaseRegFlags = RegState::Kill;
967 
968     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
969         .addReg(ImmReg)
970         .addReg(AddrReg->getReg(), 0, BaseSubReg)
971         .addImm(0); // clamp bit
972     BaseSubReg = 0;
973   }
974 
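  // For illustration: with CI.BaseOff == 1024, the block above materializes
  // the offset with an s_mov_b32 and adds it to the original address with a
  // VALU add; the ds_read2 built below then uses that rebased address together
  // with the small element offsets computed by offsetsCanBeCombined().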
975   MachineInstrBuilder Read2 =
976       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
977           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
978           .addImm(NewOffset0)                        // offset0
979           .addImm(NewOffset1)                        // offset1
980           .addImm(0)                                 // gds
981           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
982 
983   (void)Read2;
984 
985   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
986 
987   // Copy to the old destination registers.
988   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
989       .add(*Dest0) // Copy to same destination including flags and sub reg.
990       .addReg(DestReg, 0, SubRegIdx0);
991   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
992       .add(*Dest1)
993       .addReg(DestReg, RegState::Kill, SubRegIdx1);
994 
995   CI.I->eraseFromParent();
996   Paired.I->eraseFromParent();
997 
998   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
999   return Read2;
1000 }
1001 
1002 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1003   if (STM->ldsRequiresM0Init())
1004     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1005   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1006                         : AMDGPU::DS_WRITE2_B64_gfx9;
1007 }
1008 
1009 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1010   if (STM->ldsRequiresM0Init())
1011     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1012                           : AMDGPU::DS_WRITE2ST64_B64;
1013 
1014   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1015                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1016 }
1017 
1018 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1019     CombineInfo &CI, CombineInfo &Paired,
1020     MachineBasicBlock::iterator InsertBefore) {
1021   MachineBasicBlock *MBB = CI.I->getParent();
1022 
  // Be sure to use .add() and not .addReg() with these operands, so that we
  // preserve the subregister index and any register flags set on them.
1025   const MachineOperand *AddrReg =
1026       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1027   const MachineOperand *Data0 =
1028       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1029   const MachineOperand *Data1 =
1030       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1031 
1032   unsigned NewOffset0 = CI.Offset;
1033   unsigned NewOffset1 = Paired.Offset;
1034   unsigned Opc =
1035       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1036 
1037   if (NewOffset0 > NewOffset1) {
1038     // Canonicalize the merged instruction so the smaller offset comes first.
1039     std::swap(NewOffset0, NewOffset1);
1040     std::swap(Data0, Data1);
1041   }
1042 
1043   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1044          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1045 
1046   const MCInstrDesc &Write2Desc = TII->get(Opc);
1047   DebugLoc DL = CI.I->getDebugLoc();
1048 
1049   Register BaseReg = AddrReg->getReg();
1050   unsigned BaseSubReg = AddrReg->getSubReg();
1051   unsigned BaseRegFlags = 0;
1052   if (CI.BaseOff) {
1053     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1054     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1055         .addImm(CI.BaseOff);
1056 
1057     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1058     BaseRegFlags = RegState::Kill;
1059 
1060     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1061         .addReg(ImmReg)
1062         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1063         .addImm(0); // clamp bit
1064     BaseSubReg = 0;
1065   }
1066 
1067   MachineInstrBuilder Write2 =
1068       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1069           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1070           .add(*Data0)                               // data0
1071           .add(*Data1)                               // data1
1072           .addImm(NewOffset0)                        // offset0
1073           .addImm(NewOffset1)                        // offset1
1074           .addImm(0)                                 // gds
1075           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1076 
1077   CI.I->eraseFromParent();
1078   Paired.I->eraseFromParent();
1079 
1080   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1081   return Write2;
1082 }
1083 
1084 MachineBasicBlock::iterator
1085 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1086                                      MachineBasicBlock::iterator InsertBefore) {
1087   MachineBasicBlock *MBB = CI.I->getParent();
1088   DebugLoc DL = CI.I->getDebugLoc();
1089   const unsigned Opcode = getNewOpcode(CI, Paired);
1090 
1091   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1092 
1093   Register DestReg = MRI->createVirtualRegister(SuperRC);
1094   unsigned MergedDMask = CI.DMask | Paired.DMask;
1095   unsigned DMaskIdx =
1096       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1097 
1098   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1099   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1100     if (I == DMaskIdx)
1101       MIB.addImm(MergedDMask);
1102     else
1103       MIB.add((*CI.I).getOperand(I));
1104   }
1105 
1106   // It shouldn't be possible to get this far if the two instructions
1107   // don't have a single memoperand, because MachineInstr::mayAlias()
1108   // will return true if this is the case.
1109   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1110 
1111   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1112   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1113 
  MachineInstr *New = MIB.addMemOperand(
      combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1115 
1116   unsigned SubRegIdx0, SubRegIdx1;
1117   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1118 
1119   // Copy to the old destination registers.
1120   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1121   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1122   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1123 
1124   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1125       .add(*Dest0) // Copy to same destination including flags and sub reg.
1126       .addReg(DestReg, 0, SubRegIdx0);
1127   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1128       .add(*Dest1)
1129       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1130 
1131   CI.I->eraseFromParent();
1132   Paired.I->eraseFromParent();
1133   return New;
1134 }
1135 
1136 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1137     CombineInfo &CI, CombineInfo &Paired,
1138     MachineBasicBlock::iterator InsertBefore) {
1139   MachineBasicBlock *MBB = CI.I->getParent();
1140   DebugLoc DL = CI.I->getDebugLoc();
1141   const unsigned Opcode = getNewOpcode(CI, Paired);
1142 
1143   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1144 
1145   Register DestReg = MRI->createVirtualRegister(SuperRC);
1146   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1147 
1148   // It shouldn't be possible to get this far if the two instructions
1149   // don't have a single memoperand, because MachineInstr::mayAlias()
1150   // will return true if this is the case.
1151   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1152 
1153   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1154   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1155 
1156   MachineInstr *New =
1157       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1158           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1159           .addImm(MergedOffset) // offset
1160           .addImm(CI.CPol)      // cpol
1161           .addMemOperand(
1162               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1163 
1164   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1165   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1166   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1167 
1168   // Copy to the old destination registers.
1169   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1170   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1171   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1172 
1173   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1174       .add(*Dest0) // Copy to same destination including flags and sub reg.
1175       .addReg(DestReg, 0, SubRegIdx0);
1176   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1177       .add(*Dest1)
1178       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1179 
1180   CI.I->eraseFromParent();
1181   Paired.I->eraseFromParent();
1182   return New;
1183 }
1184 
1185 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1186     CombineInfo &CI, CombineInfo &Paired,
1187     MachineBasicBlock::iterator InsertBefore) {
1188   MachineBasicBlock *MBB = CI.I->getParent();
1189   DebugLoc DL = CI.I->getDebugLoc();
1190 
1191   const unsigned Opcode = getNewOpcode(CI, Paired);
1192 
1193   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1194 
  // Create the destination register for the merged load.
1196   Register DestReg = MRI->createVirtualRegister(SuperRC);
1197   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1198 
1199   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1200 
1201   AddressRegs Regs = getRegs(Opcode, *TII);
1202 
1203   if (Regs.VAddr)
1204     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1205 
1206   // It shouldn't be possible to get this far if the two instructions
1207   // don't have a single memoperand, because MachineInstr::mayAlias()
1208   // will return true if this is the case.
1209   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1210 
1211   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1212   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1213 
1214   MachineInstr *New =
1215     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1216         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1217         .addImm(MergedOffset) // offset
1218         .addImm(CI.CPol)      // cpol
1219         .addImm(0)            // tfe
1220         .addImm(0)            // swz
1221         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1222 
1223   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1224   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1225   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1226 
1227   // Copy to the old destination registers.
1228   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1229   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1230   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1231 
1232   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1233       .add(*Dest0) // Copy to same destination including flags and sub reg.
1234       .addReg(DestReg, 0, SubRegIdx0);
1235   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1236       .add(*Dest1)
1237       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1238 
1239   CI.I->eraseFromParent();
1240   Paired.I->eraseFromParent();
1241   return New;
1242 }
1243 
1244 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1245     CombineInfo &CI, CombineInfo &Paired,
1246     MachineBasicBlock::iterator InsertBefore) {
1247   MachineBasicBlock *MBB = CI.I->getParent();
1248   DebugLoc DL = CI.I->getDebugLoc();
1249 
1250   const unsigned Opcode = getNewOpcode(CI, Paired);
1251 
1252   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1253 
  // Create the destination register for the merged load.
1255   Register DestReg = MRI->createVirtualRegister(SuperRC);
1256   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1257 
1258   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1259 
1260   AddressRegs Regs = getRegs(Opcode, *TII);
1261 
1262   if (Regs.VAddr)
1263     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1264 
1265   unsigned JoinedFormat =
1266       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1267 
1268   // It shouldn't be possible to get this far if the two instructions
1269   // don't have a single memoperand, because MachineInstr::mayAlias()
1270   // will return true if this is the case.
1271   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1272 
1273   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1274   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1275 
1276   MachineInstr *New =
1277       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1278           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1279           .addImm(MergedOffset) // offset
1280           .addImm(JoinedFormat) // format
1281           .addImm(CI.CPol)      // cpol
1282           .addImm(0)            // tfe
1283           .addImm(0)            // swz
1284           .addMemOperand(
1285               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1286 
1287   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1288   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1289   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1290 
1291   // Copy to the old destination registers.
1292   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1293   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1294   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1295 
1296   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1297       .add(*Dest0) // Copy to same destination including flags and sub reg.
1298       .addReg(DestReg, 0, SubRegIdx0);
1299   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1300       .add(*Dest1)
1301       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1302 
1303   CI.I->eraseFromParent();
1304   Paired.I->eraseFromParent();
1305   return New;
1306 }
1307 
1308 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1309     CombineInfo &CI, CombineInfo &Paired,
1310     MachineBasicBlock::iterator InsertBefore) {
1311   MachineBasicBlock *MBB = CI.I->getParent();
1312   DebugLoc DL = CI.I->getDebugLoc();
1313 
1314   const unsigned Opcode = getNewOpcode(CI, Paired);
1315 
1316   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1317   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1318   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1319 
1320   // Copy to the new source register.
1321   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1322   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1323 
1324   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1325   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1326 
1327   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1328       .add(*Src0)
1329       .addImm(SubRegIdx0)
1330       .add(*Src1)
1331       .addImm(SubRegIdx1);
1332 
1333   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1334                  .addReg(SrcReg, RegState::Kill);
1335 
1336   AddressRegs Regs = getRegs(Opcode, *TII);
1337 
1338   if (Regs.VAddr)
1339     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1340 
1341   unsigned JoinedFormat =
1342       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1343 
1344   // It shouldn't be possible to get this far if the two instructions
1345   // don't have a single memoperand, because MachineInstr::mayAlias()
1346   // will return true if this is the case.
1347   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1348 
1349   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1350   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1351 
1352   MachineInstr *New =
1353       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1354           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1355           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1356           .addImm(JoinedFormat)                     // format
1357           .addImm(CI.CPol)                          // cpol
1358           .addImm(0)                                // tfe
1359           .addImm(0)                                // swz
1360           .addMemOperand(
1361               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1362 
1363   CI.I->eraseFromParent();
1364   Paired.I->eraseFromParent();
1365   return New;
1366 }
1367 
1368 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1369                                             const CombineInfo &Paired) {
1370   const unsigned Width = CI.Width + Paired.Width;
1371 
1372   switch (CI.InstClass) {
1373   default:
1374     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1375     // FIXME: Handle d16 correctly
1376     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1377                                   Width);
1378   case TBUFFER_LOAD:
1379   case TBUFFER_STORE:
1380     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1381                                   Width);
1382 
1383   case UNKNOWN:
1384     llvm_unreachable("Unknown instruction class");
1385   case S_BUFFER_LOAD_IMM:
1386     switch (Width) {
1387     default:
1388       return 0;
1389     case 2:
1390       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1391     case 4:
1392       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1393     case 8:
1394       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1395     }
1396   case MIMG:
1397     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1398            "No overlaps");
1399     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1400   }
1401 }
1402 
1403 std::pair<unsigned, unsigned>
1404 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1405                                     const CombineInfo &Paired) {
1406   bool ReverseOrder;
1407   if (CI.InstClass == MIMG) {
1408     assert(
1409         (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1410         "No overlaps");
1411     ReverseOrder = CI.DMask > Paired.DMask;
1412   } else {
1413     ReverseOrder = CI.Offset > Paired.Offset;
1414   }
1415 
1416   unsigned Idx0;
1417   unsigned Idx1;
1418 
1419   static const unsigned Idxs[5][4] = {
1420       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1421       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1422       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1423       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1424       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1425   };
1426 
1427   assert(CI.Width >= 1 && CI.Width <= 4);
1428   assert(Paired.Width >= 1 && Paired.Width <= 4);
1429 
1430   if (ReverseOrder) {
1431     Idx1 = Idxs[0][Paired.Width - 1];
1432     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1433   } else {
1434     Idx0 = Idxs[0][CI.Width - 1];
1435     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1436   }
1437 
1438   return std::make_pair(Idx0, Idx1);
1439 }
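// Worked example for the table above: with CI.Width = 1, Paired.Width = 2 and
// CI.Offset < Paired.Offset (no reordering), the result is (sub0, sub1_sub2):
// CI's dword lands in sub0 of the merged register and Paired's two dwords in
// sub1-sub2. If Paired comes first in memory (ReverseOrder), the same widths
// yield (sub2, sub0_sub1) instead.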
1440 
1441 const TargetRegisterClass *
1442 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1443                                              const CombineInfo &Paired) {
1444   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1445     switch (CI.Width + Paired.Width) {
1446     default:
1447       return nullptr;
1448     case 2:
1449       return &AMDGPU::SReg_64_XEXECRegClass;
1450     case 4:
1451       return &AMDGPU::SGPR_128RegClass;
1452     case 8:
1453       return &AMDGPU::SGPR_256RegClass;
1454     case 16:
1455       return &AMDGPU::SGPR_512RegClass;
1456     }
1457   }
1458 
1459   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1460   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1461              ? TRI->getAGPRClassForBitWidth(BitWidth)
1462              : TRI->getVGPRClassForBitWidth(BitWidth);
1463 }
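// For example, merging two single-dword accesses (Width 1 + 1) needs a 64-bit
// register class: a 64-bit VGPR class for VGPR data, or the corresponding
// 64-bit AGPR class when getDataRegClass() reports AGPR data. S_BUFFER_LOAD
// merges use the SGPR classes from the switch above instead.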
1464 
1465 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1466     CombineInfo &CI, CombineInfo &Paired,
1467     MachineBasicBlock::iterator InsertBefore) {
1468   MachineBasicBlock *MBB = CI.I->getParent();
1469   DebugLoc DL = CI.I->getDebugLoc();
1470 
1471   const unsigned Opcode = getNewOpcode(CI, Paired);
1472 
1473   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1474   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1475   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1476 
1477   // Copy to the new source register.
1478   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1479   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1480 
1481   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1482   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1483 
1484   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1485       .add(*Src0)
1486       .addImm(SubRegIdx0)
1487       .add(*Src1)
1488       .addImm(SubRegIdx1);
1489 
1490   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1491                  .addReg(SrcReg, RegState::Kill);
1492 
1493   AddressRegs Regs = getRegs(Opcode, *TII);
1494 
1495   if (Regs.VAddr)
1496     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1497
1499   // It shouldn't be possible to get this far if the two instructions
1500   // don't have a single memoperand, because MachineInstr::mayAlias()
1501   // will return true if this is the case.
1502   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1503 
1504   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1505   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1506 
1507   MachineInstr *New =
1508     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1509         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1510         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1511         .addImm(CI.CPol)      // cpol
1512         .addImm(0)            // tfe
1513         .addImm(0)            // swz
1514         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1515 
1516   CI.I->eraseFromParent();
1517   Paired.I->eraseFromParent();
1518   return New;
1519 }
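// Illustration of the buffer-store merge above, with hypothetical virtual
// registers (a rough sketch, not verifier-exact MIR; addressing-mode suffixes
// omitted):
//   BUFFER_STORE_DWORD %data0, %vaddr, %rsrc, %soffset, offset:16, ...
//   BUFFER_STORE_DWORD %data1, %vaddr, %rsrc, %soffset, offset:20, ...
// ==>
//   %merged:vreg_64 = REG_SEQUENCE %data0, %subreg.sub0, %data1, %subreg.sub1
//   BUFFER_STORE_DWORDX2 killed %merged, %vaddr, %rsrc, %soffset, offset:16, ...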
1520 
1521 MachineOperand
1522 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1523   APInt V(32, Val, true);
1524   if (TII->isInlineConstant(V))
1525     return MachineOperand::CreateImm(Val);
1526 
1527   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1528   MachineInstr *Mov =
1529   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1530           TII->get(AMDGPU::S_MOV_B32), Reg)
1531     .addImm(Val);
1532   (void)Mov;
1533   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1534   return MachineOperand::CreateReg(Reg, false);
1535 }
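// For example (illustrative values only): a value that is a legal inline
// constant, such as 48, is returned directly as an immediate operand, while a
// value like 0x1800 is first materialized into a fresh SGPR with S_MOV_B32 and
// returned as a register operand.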
1536 
1537 // Compute base address using Addr and return the final register.
1538 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1539                                            const MemAddress &Addr) const {
1540   MachineBasicBlock *MBB = MI.getParent();
1541   MachineBasicBlock::iterator MBBI = MI.getIterator();
1542   DebugLoc DL = MI.getDebugLoc();
1543 
1544   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1545           Addr.Base.LoSubReg) &&
1546          "Expected 32-bit Base-Register-Low!!");
1547 
1548   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1549           Addr.Base.HiSubReg) &&
1550          "Expected 32-bit Base-Register-Hi!!");
1551 
1552   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1553   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1554   MachineOperand OffsetHi =
1555     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1556 
1557   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1558   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1559   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1560 
1561   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1562   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1563   MachineInstr *LoHalf =
1564     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1565       .addReg(CarryReg, RegState::Define)
1566       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1567       .add(OffsetLo)
1568       .addImm(0); // clamp bit
1569   (void)LoHalf;
1570   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1571 
1572   MachineInstr *HiHalf =
1573   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1574     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1575     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1576     .add(OffsetHi)
1577     .addReg(CarryReg, RegState::Kill)
1578     .addImm(0); // clamp bit
1579   (void)HiHalf;
1580   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1581 
1582   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1583   MachineInstr *FullBase =
1584     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1585       .addReg(DestSub0)
1586       .addImm(AMDGPU::sub0)
1587       .addReg(DestSub1)
1588       .addImm(AMDGPU::sub1);
1589   (void)FullBase;
1590   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1591 
1592   return FullDestReg;
1593 }
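// Illustrative result for Addr.Offset == 6144, with hypothetical virtual
// registers (a sketch of the MIR emitted above):
//   %off:sreg_32 = S_MOV_B32 6144
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, %off, 0
//   %hi:vgpr_32, %deadcarry = V_ADDC_U32_e64 %base_hi, 0, killed %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The returned %newbase is then used by updateBaseAndOffset() to fold the
// remaining delta into the instruction's immediate offset.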
1594 
1595 // Update base and offset with the NewBase and NewOffset in MI.
1596 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1597                                                Register NewBase,
1598                                                int32_t NewOffset) const {
1599   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1600   Base->setReg(NewBase);
1601   Base->setIsKill(false);
1602   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1603 }
1604 
1605 Optional<int32_t>
1606 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1607   if (Op.isImm())
1608     return Op.getImm();
1609 
1610   if (!Op.isReg())
1611     return None;
1612 
1613   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1614   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1615       !Def->getOperand(1).isImm())
1616     return None;
1617 
1618   return Def->getOperand(1).getImm();
1619 }
1620 
1621 // Analyze Base and extract:
1622 //  - 32-bit base registers and subregisters
1623 //  - a 64-bit constant offset
1624 // The base computation is expected to look like:
1625 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1626 //   %LO:vgpr_32, %c:sreg_64_xexec =
1627 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1628 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1629 //   %Base:vreg_64 =
1630 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1631 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1632                                                       MemAddress &Addr) const {
1633   if (!Base.isReg())
1634     return;
1635 
1636   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1637   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1638       || Def->getNumOperands() != 5)
1639     return;
1640 
1641   MachineOperand BaseLo = Def->getOperand(1);
1642   MachineOperand BaseHi = Def->getOperand(3);
1643   if (!BaseLo.isReg() || !BaseHi.isReg())
1644     return;
1645 
1646   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1647   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1648 
1649   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1650       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1651     return;
1652 
1653   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1654   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1655 
1656   auto Offset0P = extractConstOffset(*Src0);
1657   if (Offset0P)
1658     BaseLo = *Src1;
1659   else {
1660     if (!(Offset0P = extractConstOffset(*Src1)))
1661       return;
1662     BaseLo = *Src0;
1663   }
1664 
1665   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1666   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1667 
1668   if (Src0->isImm())
1669     std::swap(Src0, Src1);
1670 
1671   if (!Src1->isImm())
1672     return;
1673 
1674   uint64_t Offset1 = Src1->getImm();
1675   BaseHi = *Src0;
1676 
1677   Addr.Base.LoReg = BaseLo.getReg();
1678   Addr.Base.HiReg = BaseHi.getReg();
1679   Addr.Base.LoSubReg = BaseLo.getSubReg();
1680   Addr.Base.HiSubReg = BaseHi.getSubReg();
1681   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1682 }
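// For the pattern shown in the comment above, this fills Addr with
// LoReg/HiReg (and subregisters) taken from %BASE_LO/%BASE_HI, and
// Addr.Offset = 8000: the low 32 bits come from the V_ADD_CO_U32 constant
// operand and the high 32 bits from the V_ADDC_U32 immediate (0 there).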
1683 
1684 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1685     MachineInstr &MI,
1686     MemInfoMap &Visited,
1687     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1688 
1689   if (!(MI.mayLoad() ^ MI.mayStore()))
1690     return false;
1691 
1692   // TODO: Support flat and scratch.
1693   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1694     return false;
1695 
1696   if (MI.mayLoad() &&
1697       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1698     return false;
1699 
1700   if (AnchorList.count(&MI))
1701     return false;
1702 
1703   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1704 
1705   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1706     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1707     return false;
1708   }
1709 
1710   // Step 1: Find the base registers and a 64-bit constant offset.
1711   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1712   MemAddress MAddr;
1713   if (Visited.find(&MI) == Visited.end()) {
1714     processBaseWithConstOffset(Base, MAddr);
1715     Visited[&MI] = MAddr;
1716   } else
1717     MAddr = Visited[&MI];
1718 
1719   if (MAddr.Offset == 0) {
1720     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1721                          " constant offsets that can be promoted.\n";);
1722     return false;
1723   }
1724 
1725   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1726              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1727 
1728   // Step 2: Traverse MI's basic block and find an anchor (an instruction with
1729   // the same base registers) at the largest 13-bit distance from MI's offset.
1730   // E.g. (64bit loads)
1731   // bb:
1732   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1733   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1734   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1735   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1736   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1737   //
1738   // Starting from the first load, the optimization tries to find a new base
1739   // within a 13-bit distance of (&a + 4096). Both &a + 6144 and &a + 8192 are
1740   // within a 13-bit distance of &a + 4096. The heuristic picks &a + 8192 as
1741   // the new base (anchor) because the larger distance can presumably
1742   // accommodate more intermediate addresses.
1743   //
1744   // Step 3: move (&a + 8192) above load1. Compute and promote offsets relative
1745   // to (&a + 8192) for load1, load2 and load4.
1746   //   addr = &a + 8192
1747   //   load1 = load(addr,       -4096)
1748   //   load2 = load(addr,       -2048)
1749   //   load3 = load(addr,       0)
1750   //   load4 = load(addr,       2048)
1751   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1752   //
1753   MachineInstr *AnchorInst = nullptr;
1754   MemAddress AnchorAddr;
1755   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1756   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1757 
1758   MachineBasicBlock *MBB = MI.getParent();
1759   MachineBasicBlock::iterator E = MBB->end();
1760   MachineBasicBlock::iterator MBBI = MI.getIterator();
1761   ++MBBI;
1762   const SITargetLowering *TLI =
1763     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1764 
1765   for ( ; MBBI != E; ++MBBI) {
1766     MachineInstr &MINext = *MBBI;
1767     // TODO: Support finding an anchor(with same base) from store addresses or
1768     // any other load addresses where the opcodes are different.
1769     if (MINext.getOpcode() != MI.getOpcode() ||
1770         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1771       continue;
1772 
1773     const MachineOperand &BaseNext =
1774       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1775     MemAddress MAddrNext;
1776     if (Visited.find(&MINext) == Visited.end()) {
1777       processBaseWithConstOffset(BaseNext, MAddrNext);
1778       Visited[&MINext] = MAddrNext;
1779     } else
1780       MAddrNext = Visited[&MINext];
1781 
1782     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1783         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1784         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1785         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1786       continue;
1787 
1788     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1789 
1790     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1791     TargetLoweringBase::AddrMode AM;
1792     AM.HasBaseReg = true;
1793     AM.BaseOffs = Dist;
1794     if (TLI->isLegalGlobalAddressingMode(AM) &&
1795         (uint32_t)std::abs(Dist) > MaxDist) {
1796       MaxDist = std::abs(Dist);
1797 
1798       AnchorAddr = MAddrNext;
1799       AnchorInst = &MINext;
1800     }
1801   }
1802 
1803   if (AnchorInst) {
1804     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1805                AnchorInst->dump());
1806     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1807                <<  AnchorAddr.Offset << "\n\n");
1808 
1809     // Instead of moving up, just re-compute anchor-instruction's base address.
1810     Register Base = computeBase(MI, AnchorAddr);
1811 
1812     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1813     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1814 
1815     for (auto P : InstsWCommonBase) {
1816       TargetLoweringBase::AddrMode AM;
1817       AM.HasBaseReg = true;
1818       AM.BaseOffs = P.second - AnchorAddr.Offset;
1819 
1820       if (TLI->isLegalGlobalAddressingMode(AM)) {
1821         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1822                    dbgs() << ")"; P.first->dump());
1823         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1824         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1825       }
1826     }
1827     AnchorList.insert(AnchorInst);
1828     return true;
1829   }
1830 
1831   return false;
1832 }
1833 
1834 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1835                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1836   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1837     if (AddrList.front().InstClass == CI.InstClass &&
1838         AddrList.front().IsAGPR == CI.IsAGPR &&
1839         AddrList.front().hasSameBaseAddress(*CI.I)) {
1840       AddrList.emplace_back(CI);
1841       return;
1842     }
1843   }
1844 
1845   // Base address not found, so add a new list.
1846   MergeableInsts.emplace_back(1, CI);
1847 }
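// The resulting structure is a list of lists: each inner list holds the
// CombineInfos that share an instruction class, AGPR-ness and base address,
// e.g. (schematically)
//   [ { DS_READ     base %a            : offsets 8, 24 },
//     { BUFFER_LOAD base %rsrc/%vaddr  : offsets 0, 4, 12 } ]
// and only instructions within the same inner list are ever paired.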
1848 
1849 std::pair<MachineBasicBlock::iterator, bool>
1850 SILoadStoreOptimizer::collectMergeableInsts(
1851     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1852     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1853     std::list<std::list<CombineInfo>> &MergeableInsts) const {
1854   bool Modified = false;
1855 
1856   // Sort potentially mergeable instructions into lists, one list per base address.
1857   unsigned Order = 0;
1858   MachineBasicBlock::iterator BlockI = Begin;
1859   for (; BlockI != End; ++BlockI) {
1860     MachineInstr &MI = *BlockI;
1861 
1862     // We run this before checking if an address is mergeable, because it can produce
1863     // better code even if the instructions aren't mergeable.
1864     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1865       Modified = true;
1866 
1867     // Treat volatile accesses, ordered accesses and unmodeled side effects as
1868     // barriers. Merging can resume after the barrier, in a separate list.
1869     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
1870       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
1871 
1872       // Search will resume after this instruction in a separate merge list.
1873       ++BlockI;
1874       break;
1875     }
1876 
1877     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1878     if (InstClass == UNKNOWN)
1879       continue;
1880 
1881     // Do not merge VMEM buffer instructions with "swizzled" bit set.
1882     int Swizzled =
1883         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
1884     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
1885       continue;
1886 
1887     CombineInfo CI;
1888     CI.setMI(MI, *this);
1889     CI.Order = Order++;
1890 
1891     if (!CI.hasMergeableAddress(*MRI))
1892       continue;
1893 
1894     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
1895       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
1896       //        operands. However, we report that ds_write2 takes only VGPR
1897       //        data so that machine copy propagation does not create an
1898       //        illegal instruction with mixed VGPR and AGPR sources.
1899       //        Consequently, if we created such an instruction the verifier
1900       //        would complain.
1901       continue;
1902     }
1903 
1904     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
1905 
1906     addInstToMergeableList(CI, MergeableInsts);
1907   }
1908 
1909   // At this point we have lists of mergeable instructions.
1910   //
1911   // Part 2: Discard lists that contain only a single instruction (there is
1912   // nothing to merge it with), and sort the remaining lists by offset so that
1913   // instructions which can be merged end up adjacent to each other. The actual
1914   // pairing is done later, in optimizeInstsWithSameBaseAddr().
1915 
1916   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1917                                                    E = MergeableInsts.end(); I != E;) {
1918 
1919     std::list<CombineInfo> &MergeList = *I;
1920     if (MergeList.size() <= 1) {
1921       // This means we have found only one instruction with a given address
1922       // that can be merged, and we need at least 2 instructions to do a merge,
1923       // so this list can be discarded.
1924       I = MergeableInsts.erase(I);
1925       continue;
1926     }
1927 
1928     // Sort the lists by offsets, this way mergeable instructions will be
1929     // adjacent to each other in the list, which will make it easier to find
1930     // matches.
1931     MergeList.sort(
1932         [] (const CombineInfo &A, const CombineInfo &B) {
1933           return A.Offset < B.Offset;
1934         });
1935     ++I;
1936   }
1937 
1938   return std::make_pair(BlockI, Modified);
1939 }
1940 
1941 // Scan the collected lists looking for adjacent memory operations with constant
1942 // offsets from the same base register. We rely on the scheduler to do the hard
1943 // work of clustering nearby loads, and assume these are all adjacent.
1944 bool SILoadStoreOptimizer::optimizeBlock(
1945                        std::list<std::list<CombineInfo> > &MergeableInsts) {
1946   bool Modified = false;
1947 
1948   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1949                                                    E = MergeableInsts.end(); I != E;) {
1950     std::list<CombineInfo> &MergeList = *I;
1951 
1952     bool OptimizeListAgain = false;
1953     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1954       // We weren't able to make any changes, so delete the list so we don't
1955       // process the same instructions the next time we try to optimize this
1956       // block.
1957       I = MergeableInsts.erase(I);
1958       continue;
1959     }
1960 
1961     Modified = true;
1962 
1963     // We made changes, but also determined that there were no more optimization
1964     // opportunities, so we don't need to reprocess the list.
1965     if (!OptimizeListAgain) {
1966       I = MergeableInsts.erase(I);
1967       continue;
1968     }
1969     OptimizeAgain = true;
1970   }
1971   return Modified;
1972 }
1973 
1974 bool
1975 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1976                                           std::list<CombineInfo> &MergeList,
1977                                           bool &OptimizeListAgain) {
1978   if (MergeList.empty())
1979     return false;
1980 
1981   bool Modified = false;
1982 
1983   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
1984        Next = std::next(I)) {
1985 
1986     auto First = I;
1987     auto Second = Next;
1988 
1989     if ((*First).Order > (*Second).Order)
1990       std::swap(First, Second);
1991     CombineInfo &CI = *First;
1992     CombineInfo &Paired = *Second;
1993 
1994     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
1995     if (!Where) {
1996       ++I;
1997       continue;
1998     }
1999 
2000     Modified = true;
2001 
2002     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2003 
2004     MachineBasicBlock::iterator NewMI;
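    // A merged access may itself still be mergeable: S_BUFFER_LOAD results can
    // grow up to 8 dwords and the VMEM/MIMG forms up to 4, so OptimizeListAgain
    // is set whenever the combined width is still below that limit, asking
    // optimizeBlock() to run over this list again.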
2005     switch (CI.InstClass) {
2006     default:
2007       llvm_unreachable("unknown InstClass");
2008       break;
2009     case DS_READ:
2010       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2011       break;
2012     case DS_WRITE:
2013       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2014       break;
2015     case S_BUFFER_LOAD_IMM:
2016       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2017       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2018       break;
2019     case BUFFER_LOAD:
2020       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2021       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2022       break;
2023     case BUFFER_STORE:
2024       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2025       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2026       break;
2027     case MIMG:
2028       NewMI = mergeImagePair(CI, Paired, Where->I);
2029       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2030       break;
2031     case TBUFFER_LOAD:
2032       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2033       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2034       break;
2035     case TBUFFER_STORE:
2036       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2037       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2038       break;
2039     }
2040     CI.setMI(NewMI, *this);
2041     CI.Order = Where->Order;
2042     if (I == Second)
2043       I = Next;
2044 
2045     MergeList.erase(Second);
2046   }
2047 
2048   return Modified;
2049 }
2050 
2051 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2052   if (skipFunction(MF.getFunction()))
2053     return false;
2054 
2055   STM = &MF.getSubtarget<GCNSubtarget>();
2056   if (!STM->loadStoreOptEnabled())
2057     return false;
2058 
2059   TII = STM->getInstrInfo();
2060   TRI = &TII->getRegisterInfo();
2061 
2062   MRI = &MF.getRegInfo();
2063   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2064 
2065   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2066 
2067   bool Modified = false;
2068 
2069   // Contains the set of instructions for which constant offsets are being
2070   // promoted to the immediate. This is tracked for one block at a time.
2071   SmallPtrSet<MachineInstr *, 4> AnchorList;
2072   MemInfoMap Visited;
2073 
2074   for (MachineBasicBlock &MBB : MF) {
2075     MachineBasicBlock::iterator SectionEnd;
2076     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2077          I = SectionEnd) {
2078       bool CollectModified;
2079       std::list<std::list<CombineInfo>> MergeableInsts;
2080 
2081       // First pass: Collect list of all instructions we know how to merge in a
2082       // subset of the block.
2083       std::tie(SectionEnd, CollectModified) =
2084           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2085 
2086       Modified |= CollectModified;
2087 
2088       do {
2089         OptimizeAgain = false;
2090         Modified |= optimizeBlock(MergeableInsts);
2091       } while (OptimizeAgain);
2092     }
2093 
2094     Visited.clear();
2095     AnchorList.clear();
2096   }
2097 
2098   return Modified;
2099 }
2100