1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads have offsets that are too large to fit in the 8-bit
55 //   offsets, but are close enough to fit in the 8 bits, we can add to the base
56 //   pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "AMDGPUSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "SIInstrInfo.h"
64 #include "SIRegisterInfo.h"
65 #include "Utils/AMDGPUBaseInfo.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/SmallVector.h"
68 #include "llvm/ADT/StringRef.h"
69 #include "llvm/Analysis/AliasAnalysis.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/IR/DebugLoc.h"
78 #include "llvm/InitializePasses.h"
79 #include "llvm/Pass.h"
80 #include "llvm/Support/Debug.h"
81 #include "llvm/Support/MathExtras.h"
82 #include "llvm/Support/raw_ostream.h"
83 #include <algorithm>
84 #include <cassert>
85 #include <cstdlib>
86 #include <iterator>
87 #include <utility>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "si-load-store-opt"
92 
93 namespace {
94 enum InstClassEnum {
95   UNKNOWN,
96   DS_READ,
97   DS_WRITE,
98   S_BUFFER_LOAD_IMM,
99   BUFFER_LOAD,
100   BUFFER_STORE,
101   MIMG,
102   TBUFFER_LOAD,
103   TBUFFER_STORE,
104 };
105 
106 enum RegisterEnum {
107   SBASE = 0x1,
108   SRSRC = 0x2,
109   SOFFSET = 0x4,
110   VADDR = 0x8,
111   ADDR = 0x10,
112   SSAMP = 0x20,
113 };
114 
115 class SILoadStoreOptimizer : public MachineFunctionPass {
116   struct CombineInfo {
117     MachineBasicBlock::iterator I;
118     MachineBasicBlock::iterator Paired;
119     unsigned EltSize;
120     unsigned Offset0;
121     unsigned Offset1;
122     unsigned Width0;
123     unsigned Width1;
124     unsigned Format0;
125     unsigned Format1;
126     unsigned BaseOff;
127     unsigned DMask0;
128     unsigned DMask1;
129     InstClassEnum InstClass;
130     bool GLC0;
131     bool GLC1;
132     bool SLC0;
133     bool SLC1;
134     bool DLC0;
135     bool DLC1;
136     bool UseST64;
137     SmallVector<MachineInstr *, 8> InstsToMove;
138     int AddrIdx[5];
139     const MachineOperand *AddrReg[5];
140     unsigned NumAddresses;
141 
142     bool hasSameBaseAddress(const MachineInstr &MI) {
143       for (unsigned i = 0; i < NumAddresses; i++) {
144         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
145 
146         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
147           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
148               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
149             return false;
150           }
151           continue;
152         }
153 
154         // Check same base pointer. Be careful of subregisters, which can occur
155         // with vectors of pointers.
156         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
157             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
158          return false;
159         }
160       }
161       return true;
162     }
163 
164     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
165       for (unsigned i = 0; i < NumAddresses; ++i) {
166         const MachineOperand *AddrOp = AddrReg[i];
167         // Immediates are always OK.
168         if (AddrOp->isImm())
169           continue;
170 
171         // Don't try to merge addresses that aren't either immediates or registers.
172         // TODO: Should be possible to merge FrameIndexes and maybe some other
173         // non-register
174         if (!AddrOp->isReg())
175           return false;
176 
177         // TODO: We should be able to merge physical reg addreses.
178         if (Register::isPhysicalRegister(AddrOp->getReg()))
179           return false;
180 
181         // If an address has only one use then there will be on other
182         // instructions with the same address, so we can't merge this one.
183         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
184           return false;
185       }
186       return true;
187     }
188 
189     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
190                const GCNSubtarget &STM);
191     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
192   };
193 
194   struct BaseRegisters {
195     unsigned LoReg = 0;
196     unsigned HiReg = 0;
197 
198     unsigned LoSubReg = 0;
199     unsigned HiSubReg = 0;
200   };
201 
202   struct MemAddress {
203     BaseRegisters Base;
204     int64_t Offset = 0;
205   };
206 
207   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
208 
209 private:
210   const GCNSubtarget *STM = nullptr;
211   const SIInstrInfo *TII = nullptr;
212   const SIRegisterInfo *TRI = nullptr;
213   const MCSubtargetInfo *STI = nullptr;
214   MachineRegisterInfo *MRI = nullptr;
215   AliasAnalysis *AA = nullptr;
216   bool OptimizeAgain;
217 
218   static bool dmasksCanBeCombined(const CombineInfo &CI,
219                                   const SIInstrInfo &TII);
220   static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI);
221   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
222   static unsigned getNewOpcode(const CombineInfo &CI);
223   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
224   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
225 
226   bool findMatchingInst(CombineInfo &CI);
227 
228   unsigned read2Opcode(unsigned EltSize) const;
229   unsigned read2ST64Opcode(unsigned EltSize) const;
230   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
231 
232   unsigned write2Opcode(unsigned EltSize) const;
233   unsigned write2ST64Opcode(unsigned EltSize) const;
234   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
235   MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
236   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
237   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
238   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
239   MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI);
240   MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI);
241 
242   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
243                            int32_t NewOffset) const;
244   unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
245   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
246   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
247   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
248   /// Promotes constant offset to the immediate by adjusting the base. It
249   /// tries to use a base from the nearby instructions that allows it to have
250   /// a 13bit constant offset which gets promoted to the immediate.
251   bool promoteConstantOffsetToImm(MachineInstr &CI,
252                                   MemInfoMap &Visited,
253                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
254   void addInstToMergeableList(const CombineInfo &CI,
255                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
256   bool collectMergeableInsts(MachineBasicBlock &MBB,
257                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
258 
259 public:
260   static char ID;
261 
262   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
263     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
264   }
265 
266   void removeCombinedInst(std::list<CombineInfo> &MergeList,
267                                          const MachineInstr &MI);
268   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
269                                      bool &OptimizeListAgain);
270   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
271 
272   bool runOnMachineFunction(MachineFunction &MF) override;
273 
274   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
275 
276   void getAnalysisUsage(AnalysisUsage &AU) const override {
277     AU.setPreservesCFG();
278     AU.addRequired<AAResultsWrapperPass>();
279 
280     MachineFunctionPass::getAnalysisUsage(AU);
281   }
282 };
283 
284 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
285   const unsigned Opc = MI.getOpcode();
286 
287   if (TII.isMUBUF(Opc)) {
288     // FIXME: Handle d16 correctly
289     return AMDGPU::getMUBUFElements(Opc);
290   }
291   if (TII.isMIMG(MI)) {
292     uint64_t DMaskImm =
293         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
294     return countPopulation(DMaskImm);
295   }
296   if (TII.isMTBUF(Opc)) {
297     return AMDGPU::getMTBUFElements(Opc);
298   }
299 
300   switch (Opc) {
301   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
302     return 1;
303   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
304     return 2;
305   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
306     return 4;
307   default:
308     return 0;
309   }
310 }
311 
312 /// Maps instruction opcode to enum InstClassEnum.
313 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
314   switch (Opc) {
315   default:
316     if (TII.isMUBUF(Opc)) {
317       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
318       default:
319         return UNKNOWN;
320       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
321       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
322       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
323       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
324         return BUFFER_LOAD;
325       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
326       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
327       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
328       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
329         return BUFFER_STORE;
330       }
331     }
332     if (TII.isMIMG(Opc)) {
333       // Ignore instructions encoded without vaddr.
334       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
335         return UNKNOWN;
336       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
337       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
338           TII.isGather4(Opc))
339         return UNKNOWN;
340       return MIMG;
341     }
342     if (TII.isMTBUF(Opc)) {
343       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
344       default:
345         return UNKNOWN;
346       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
347       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
348       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
349       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
350         return TBUFFER_LOAD;
351       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
352       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
353       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
354       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
355         return TBUFFER_STORE;
356       }
357     }
358     return UNKNOWN;
359   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
360   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
361   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
362     return S_BUFFER_LOAD_IMM;
363   case AMDGPU::DS_READ_B32:
364   case AMDGPU::DS_READ_B32_gfx9:
365   case AMDGPU::DS_READ_B64:
366   case AMDGPU::DS_READ_B64_gfx9:
367     return DS_READ;
368   case AMDGPU::DS_WRITE_B32:
369   case AMDGPU::DS_WRITE_B32_gfx9:
370   case AMDGPU::DS_WRITE_B64:
371   case AMDGPU::DS_WRITE_B64_gfx9:
372     return DS_WRITE;
373   }
374 }
375 
376 /// Determines instruction subclass from opcode. Only instructions
377 /// of the same subclass can be merged together.
378 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
379   switch (Opc) {
380   default:
381     if (TII.isMUBUF(Opc))
382       return AMDGPU::getMUBUFBaseOpcode(Opc);
383     if (TII.isMIMG(Opc)) {
384       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
385       assert(Info);
386       return Info->BaseOpcode;
387     }
388     if (TII.isMTBUF(Opc))
389       return AMDGPU::getMTBUFBaseOpcode(Opc);
390     return -1;
391   case AMDGPU::DS_READ_B32:
392   case AMDGPU::DS_READ_B32_gfx9:
393   case AMDGPU::DS_READ_B64:
394   case AMDGPU::DS_READ_B64_gfx9:
395   case AMDGPU::DS_WRITE_B32:
396   case AMDGPU::DS_WRITE_B32_gfx9:
397   case AMDGPU::DS_WRITE_B64:
398   case AMDGPU::DS_WRITE_B64_gfx9:
399     return Opc;
400   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
401   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
402   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
403     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
404   }
405 }
406 
407 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
408   if (TII.isMUBUF(Opc)) {
409     unsigned result = 0;
410 
411     if (AMDGPU::getMUBUFHasVAddr(Opc)) {
412       result |= VADDR;
413     }
414 
415     if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
416       result |= SRSRC;
417     }
418 
419     if (AMDGPU::getMUBUFHasSoffset(Opc)) {
420       result |= SOFFSET;
421     }
422 
423     return result;
424   }
425 
426   if (TII.isMIMG(Opc)) {
427     unsigned result = VADDR | SRSRC;
428     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
429     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
430       result |= SSAMP;
431 
432     return result;
433   }
434   if (TII.isMTBUF(Opc)) {
435     unsigned result = 0;
436 
437     if (AMDGPU::getMTBUFHasVAddr(Opc)) {
438       result |= VADDR;
439     }
440 
441     if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
442       result |= SRSRC;
443     }
444 
445     if (AMDGPU::getMTBUFHasSoffset(Opc)) {
446       result |= SOFFSET;
447     }
448 
449     return result;
450   }
451 
452   switch (Opc) {
453   default:
454     return 0;
455   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
456   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
457   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
458     return SBASE;
459   case AMDGPU::DS_READ_B32:
460   case AMDGPU::DS_READ_B64:
461   case AMDGPU::DS_READ_B32_gfx9:
462   case AMDGPU::DS_READ_B64_gfx9:
463   case AMDGPU::DS_WRITE_B32:
464   case AMDGPU::DS_WRITE_B64:
465   case AMDGPU::DS_WRITE_B32_gfx9:
466   case AMDGPU::DS_WRITE_B64_gfx9:
467     return ADDR;
468   }
469 }
470 
471 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
472                                               const SIInstrInfo &TII,
473                                               const GCNSubtarget &STM) {
474   I = MI;
475   unsigned Opc = MI->getOpcode();
476   InstClass = getInstClass(Opc, TII);
477 
478   if (InstClass == UNKNOWN)
479     return;
480 
481   switch (InstClass) {
482   case DS_READ:
483    EltSize =
484           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
485                                                                           : 4;
486    break;
487   case DS_WRITE:
488     EltSize =
489           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
490                                                                             : 4;
491     break;
492   case S_BUFFER_LOAD_IMM:
493     EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
494     break;
495   default:
496     EltSize = 4;
497     break;
498   }
499 
500   if (InstClass == MIMG) {
501     DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
502   } else {
503     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
504     Offset0 = I->getOperand(OffsetIdx).getImm();
505   }
506 
507   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
508     Format0 = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
509 
510   Width0 = getOpcodeWidth(*I, TII);
511 
512   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
513     Offset0 &= 0xffff;
514   } else if (InstClass != MIMG) {
515     GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
516     if (InstClass != S_BUFFER_LOAD_IMM) {
517       SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
518     }
519     DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
520   }
521 
522   unsigned AddrOpName[5] = {0};
523   NumAddresses = 0;
524   const unsigned Regs = getRegs(I->getOpcode(), TII);
525 
526   if (Regs & ADDR) {
527     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
528   }
529 
530   if (Regs & SBASE) {
531     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
532   }
533 
534   if (Regs & SRSRC) {
535     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
536   }
537 
538   if (Regs & SOFFSET) {
539     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
540   }
541 
542   if (Regs & VADDR) {
543     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
544   }
545 
546   if (Regs & SSAMP) {
547     AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
548   }
549 
550   for (unsigned i = 0; i < NumAddresses; i++) {
551     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
552     AddrReg[i] = &I->getOperand(AddrIdx[i]);
553   }
554 
555   InstsToMove.clear();
556 }
557 
558 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
559                                                   const SIInstrInfo &TII) {
560   Paired = MI;
561   assert(InstClass == getInstClass(Paired->getOpcode(), TII));
562 
563   if (InstClass == MIMG) {
564     DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
565   } else {
566     int OffsetIdx =
567         AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
568     Offset1 = Paired->getOperand(OffsetIdx).getImm();
569   }
570 
571   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
572     Format1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::format)->getImm();
573 
574   Width1 = getOpcodeWidth(*Paired, TII);
575   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
576     Offset1 &= 0xffff;
577   } else if (InstClass != MIMG) {
578     GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
579     if (InstClass != S_BUFFER_LOAD_IMM) {
580       SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
581     }
582     DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
583   }
584 }
585 
586 } // end anonymous namespace.
587 
588 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
589                       "SI Load Store Optimizer", false, false)
590 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
591 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
592                     false, false)
593 
594 char SILoadStoreOptimizer::ID = 0;
595 
596 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
597 
598 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
599   return new SILoadStoreOptimizer();
600 }
601 
602 static void moveInstsAfter(MachineBasicBlock::iterator I,
603                            ArrayRef<MachineInstr *> InstsToMove) {
604   MachineBasicBlock *MBB = I->getParent();
605   ++I;
606   for (MachineInstr *MI : InstsToMove) {
607     MI->removeFromParent();
608     MBB->insert(I, MI);
609   }
610 }
611 
612 static void addDefsUsesToList(const MachineInstr &MI,
613                               DenseSet<unsigned> &RegDefs,
614                               DenseSet<unsigned> &PhysRegUses) {
615   for (const MachineOperand &Op : MI.operands()) {
616     if (Op.isReg()) {
617       if (Op.isDef())
618         RegDefs.insert(Op.getReg());
619       else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
620         PhysRegUses.insert(Op.getReg());
621     }
622   }
623 }
624 
625 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
626                                       MachineBasicBlock::iterator B,
627                                       AliasAnalysis *AA) {
628   // RAW or WAR - cannot reorder
629   // WAW - cannot reorder
630   // RAR - safe to reorder
631   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
632 }
633 
634 // Add MI and its defs to the lists if MI reads one of the defs that are
635 // already in the list. Returns true in that case.
636 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
637                                   DenseSet<unsigned> &PhysRegUses,
638                                   SmallVectorImpl<MachineInstr *> &Insts) {
639   for (MachineOperand &Use : MI.operands()) {
640     // If one of the defs is read, then there is a use of Def between I and the
641     // instruction that I will potentially be merged with. We will need to move
642     // this instruction after the merged instructions.
643     //
644     // Similarly, if there is a def which is read by an instruction that is to
645     // be moved for merging, then we need to move the def-instruction as well.
646     // This can only happen for physical registers such as M0; virtual
647     // registers are in SSA form.
648     if (Use.isReg() &&
649         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
650          (Use.isDef() && RegDefs.count(Use.getReg())) ||
651          (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
652           PhysRegUses.count(Use.getReg())))) {
653       Insts.push_back(&MI);
654       addDefsUsesToList(MI, RegDefs, PhysRegUses);
655       return true;
656     }
657   }
658 
659   return false;
660 }
661 
662 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
663                                     ArrayRef<MachineInstr *> InstsToMove,
664                                     AliasAnalysis *AA) {
665   assert(MemOp.mayLoadOrStore());
666 
667   for (MachineInstr *InstToMove : InstsToMove) {
668     if (!InstToMove->mayLoadOrStore())
669       continue;
670     if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
671       return false;
672   }
673   return true;
674 }
675 
676 // This function assumes that \p A and \p B have are identical except for
677 // size and offset, and they referecne adjacent memory.
678 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
679                                                    const MachineMemOperand *A,
680                                                    const MachineMemOperand *B) {
681   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
682   unsigned Size = A->getSize() + B->getSize();
683   // This function adds the offset parameter to the existing offset for A,
684   // so we pass 0 here as the offset and then manually set it to the correct
685   // value after the call.
686   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
687   MMO->setOffset(MinOffset);
688   return MMO;
689 }
690 
691 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
692   assert(CI.InstClass == MIMG);
693 
694   // Ignore instructions with tfe/lwe set.
695   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
696   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
697 
698   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
699     return false;
700 
701   // Check other optional immediate operands for equality.
702   unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
703                                 AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
704                                 AMDGPU::OpName::da,  AMDGPU::OpName::r128};
705 
706   for (auto op : OperandsToMatch) {
707     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
708     if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
709       return false;
710     if (Idx != -1 &&
711         CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
712       return false;
713   }
714 
715   // Check DMask for overlaps.
716   unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
717   unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
718 
719   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
720   if ((1u << AllowedBitsForMin) <= MinMask)
721     return false;
722 
723   return true;
724 }
725 
726 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
727                                        unsigned ComponentCount,
728                                        const MCSubtargetInfo &STI) {
729   if (ComponentCount > 4)
730     return 0;
731 
732   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
733       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
734   if (!OldFormatInfo)
735     return 0;
736 
737   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
738       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
739                                            ComponentCount,
740                                            OldFormatInfo->NumFormat, STI);
741 
742   if (!NewFormatInfo)
743     return 0;
744 
745   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
746          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
747 
748   return NewFormatInfo->Format;
749 }
750 
751 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
752                                                 const MCSubtargetInfo &STI) {
753   assert(CI.InstClass != MIMG);
754 
755   // XXX - Would the same offset be OK? Is there any reason this would happen or
756   // be useful?
757   if (CI.Offset0 == CI.Offset1)
758     return false;
759 
760   // This won't be valid if the offset isn't aligned.
761   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
762     return false;
763 
764   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
765 
766     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
767         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format0, STI);
768     if (!Info0)
769       return false;
770     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
771         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format1, STI);
772     if (!Info1)
773       return false;
774 
775     if (Info0->BitsPerComp != Info1->BitsPerComp ||
776         Info0->NumFormat != Info1->NumFormat)
777       return false;
778 
779     // TODO: Should be possible to support more formats, but if format loads
780     // are not dword-aligned, the merged load might not be valid.
781     if (Info0->BitsPerComp != 32)
782       return false;
783 
784     if (getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, STI) == 0)
785       return false;
786   }
787 
788   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
789   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
790   CI.UseST64 = false;
791   CI.BaseOff = 0;
792 
793   // Handle SMEM and VMEM instructions.
794   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
795     return (EltOffset0 + CI.Width0 == EltOffset1 ||
796             EltOffset1 + CI.Width1 == EltOffset0) &&
797            CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
798            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
799   }
800 
801   // If the offset in elements doesn't fit in 8-bits, we might be able to use
802   // the stride 64 versions.
803   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
804       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
805     CI.Offset0 = EltOffset0 / 64;
806     CI.Offset1 = EltOffset1 / 64;
807     CI.UseST64 = true;
808     return true;
809   }
810 
811   // Check if the new offsets fit in the reduced 8-bit range.
812   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
813     CI.Offset0 = EltOffset0;
814     CI.Offset1 = EltOffset1;
815     return true;
816   }
817 
818   // Try to shift base address to decrease offsets.
819   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
820   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
821 
822   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
823     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
824     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
825     CI.UseST64 = true;
826     return true;
827   }
828 
829   if (isUInt<8>(OffsetDiff)) {
830     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
831     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
832     return true;
833   }
834 
835   return false;
836 }
837 
838 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
839                                      const CombineInfo &CI) {
840   const unsigned Width = (CI.Width0 + CI.Width1);
841   switch (CI.InstClass) {
842   default:
843     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
844   case S_BUFFER_LOAD_IMM:
845     switch (Width) {
846     default:
847       return false;
848     case 2:
849     case 4:
850       return true;
851     }
852   }
853 }
854 
855 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
856   MachineBasicBlock *MBB = CI.I->getParent();
857   MachineBasicBlock::iterator E = MBB->end();
858   MachineBasicBlock::iterator MBBI = CI.I;
859 
860   const unsigned Opc = CI.I->getOpcode();
861   const InstClassEnum InstClass = getInstClass(Opc, *TII);
862 
863   if (InstClass == UNKNOWN) {
864     return false;
865   }
866   const unsigned InstSubclass = getInstSubclass(Opc, *TII);
867 
868   // Do not merge VMEM buffer instructions with "swizzled" bit set.
869   int Swizzled =
870       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
871   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
872     return false;
873 
874   ++MBBI;
875 
876   DenseSet<unsigned> RegDefsToMove;
877   DenseSet<unsigned> PhysRegUsesToMove;
878   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
879 
880   for (; MBBI != E; ++MBBI) {
881 
882     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
883         (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
884       // This is not a matching instruction, but we can keep looking as
885       // long as one of these conditions are met:
886       // 1. It is safe to move I down past MBBI.
887       // 2. It is safe to move MBBI down past the instruction that I will
888       //    be merged into.
889 
890       if (MBBI->hasUnmodeledSideEffects()) {
891         // We can't re-order this instruction with respect to other memory
892         // operations, so we fail both conditions mentioned above.
893         return false;
894       }
895 
896       if (MBBI->mayLoadOrStore() &&
897           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
898            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
899         // We fail condition #1, but we may still be able to satisfy condition
900         // #2.  Add this instruction to the move list and then we will check
901         // if condition #2 holds once we have selected the matching instruction.
902         CI.InstsToMove.push_back(&*MBBI);
903         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
904         continue;
905       }
906 
907       // When we match I with another DS instruction we will be moving I down
908       // to the location of the matched instruction any uses of I will need to
909       // be moved down as well.
910       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
911                             CI.InstsToMove);
912       continue;
913     }
914 
915     // Don't merge volatiles.
916     if (MBBI->hasOrderedMemoryRef())
917       return false;
918 
919     int Swizzled =
920         AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
921     if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
922       return false;
923 
924     // Handle a case like
925     //   DS_WRITE_B32 addr, v, idx0
926     //   w = DS_READ_B32 addr, idx0
927     //   DS_WRITE_B32 addr, f(w), idx1
928     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
929     // merging of the two writes.
930     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
931                               CI.InstsToMove))
932       continue;
933 
934     bool Match = CI.hasSameBaseAddress(*MBBI);
935 
936     if (Match) {
937       CI.setPaired(MBBI, *TII);
938 
939       // Check both offsets (or masks for MIMG) can be combined and fit in the
940       // reduced range.
941       bool canBeCombined =
942           CI.InstClass == MIMG
943               ? dmasksCanBeCombined(CI, *TII)
944               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI, *STI);
945 
946       // We also need to go through the list of instructions that we plan to
947       // move and make sure they are all safe to move down past the merged
948       // instruction.
949       if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
950         return true;
951     }
952 
953     // We've found a load/store that we couldn't merge for some reason.
954     // We could potentially keep looking, but we'd need to make sure that
955     // it was safe to move I and also all the instruction in InstsToMove
956     // down past this instruction.
957     // check if we can move I across MBBI and if we can move all I's users
958     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
959         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
960       break;
961   }
962   return false;
963 }
964 
965 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
966   if (STM->ldsRequiresM0Init())
967     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
968   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
969 }
970 
971 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
972   if (STM->ldsRequiresM0Init())
973     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
974 
975   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
976                         : AMDGPU::DS_READ2ST64_B64_gfx9;
977 }
978 
979 MachineBasicBlock::iterator
980 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
981   MachineBasicBlock *MBB = CI.I->getParent();
982 
983   // Be careful, since the addresses could be subregisters themselves in weird
984   // cases, like vectors of pointers.
985   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
986 
987   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
988   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
989 
990   unsigned NewOffset0 = CI.Offset0;
991   unsigned NewOffset1 = CI.Offset1;
992   unsigned Opc =
993       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
994 
995   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
996   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
997 
998   if (NewOffset0 > NewOffset1) {
999     // Canonicalize the merged instruction so the smaller offset comes first.
1000     std::swap(NewOffset0, NewOffset1);
1001     std::swap(SubRegIdx0, SubRegIdx1);
1002   }
1003 
1004   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1005          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1006 
1007   const MCInstrDesc &Read2Desc = TII->get(Opc);
1008 
1009   const TargetRegisterClass *SuperRC =
1010       (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
1011   Register DestReg = MRI->createVirtualRegister(SuperRC);
1012 
1013   DebugLoc DL = CI.I->getDebugLoc();
1014 
1015   Register BaseReg = AddrReg->getReg();
1016   unsigned BaseSubReg = AddrReg->getSubReg();
1017   unsigned BaseRegFlags = 0;
1018   if (CI.BaseOff) {
1019     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1020     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1021         .addImm(CI.BaseOff);
1022 
1023     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1024     BaseRegFlags = RegState::Kill;
1025 
1026     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
1027         .addReg(ImmReg)
1028         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1029         .addImm(0); // clamp bit
1030     BaseSubReg = 0;
1031   }
1032 
1033   MachineInstrBuilder Read2 =
1034       BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
1035           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1036           .addImm(NewOffset0)                        // offset0
1037           .addImm(NewOffset1)                        // offset1
1038           .addImm(0)                                 // gds
1039           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1040 
1041   (void)Read2;
1042 
1043   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1044 
1045   // Copy to the old destination registers.
1046   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1047       .add(*Dest0) // Copy to same destination including flags and sub reg.
1048       .addReg(DestReg, 0, SubRegIdx0);
1049   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1050                             .add(*Dest1)
1051                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1052 
1053   moveInstsAfter(Copy1, CI.InstsToMove);
1054 
1055   CI.I->eraseFromParent();
1056   CI.Paired->eraseFromParent();
1057 
1058   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1059   return Read2;
1060 }
1061 
1062 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1063   if (STM->ldsRequiresM0Init())
1064     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1065   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1066                         : AMDGPU::DS_WRITE2_B64_gfx9;
1067 }
1068 
1069 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1070   if (STM->ldsRequiresM0Init())
1071     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1072                           : AMDGPU::DS_WRITE2ST64_B64;
1073 
1074   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1075                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1076 }
1077 
1078 MachineBasicBlock::iterator
1079 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
1080   MachineBasicBlock *MBB = CI.I->getParent();
1081 
1082   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1083   // sure we preserve the subregister index and any register flags set on them.
1084   const MachineOperand *AddrReg =
1085       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1086   const MachineOperand *Data0 =
1087       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1088   const MachineOperand *Data1 =
1089       TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
1090 
1091   unsigned NewOffset0 = CI.Offset0;
1092   unsigned NewOffset1 = CI.Offset1;
1093   unsigned Opc =
1094       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1095 
1096   if (NewOffset0 > NewOffset1) {
1097     // Canonicalize the merged instruction so the smaller offset comes first.
1098     std::swap(NewOffset0, NewOffset1);
1099     std::swap(Data0, Data1);
1100   }
1101 
1102   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1103          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1104 
1105   const MCInstrDesc &Write2Desc = TII->get(Opc);
1106   DebugLoc DL = CI.I->getDebugLoc();
1107 
1108   Register BaseReg = AddrReg->getReg();
1109   unsigned BaseSubReg = AddrReg->getSubReg();
1110   unsigned BaseRegFlags = 0;
1111   if (CI.BaseOff) {
1112     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1113     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1114         .addImm(CI.BaseOff);
1115 
1116     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1117     BaseRegFlags = RegState::Kill;
1118 
1119     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
1120         .addReg(ImmReg)
1121         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1122         .addImm(0); // clamp bit
1123     BaseSubReg = 0;
1124   }
1125 
1126   MachineInstrBuilder Write2 =
1127       BuildMI(*MBB, CI.Paired, DL, Write2Desc)
1128           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1129           .add(*Data0)                               // data0
1130           .add(*Data1)                               // data1
1131           .addImm(NewOffset0)                        // offset0
1132           .addImm(NewOffset1)                        // offset1
1133           .addImm(0)                                 // gds
1134           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1135 
1136   moveInstsAfter(Write2, CI.InstsToMove);
1137 
1138   CI.I->eraseFromParent();
1139   CI.Paired->eraseFromParent();
1140 
1141   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1142   return Write2;
1143 }
1144 
1145 MachineBasicBlock::iterator
1146 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
1147   MachineBasicBlock *MBB = CI.I->getParent();
1148   DebugLoc DL = CI.I->getDebugLoc();
1149   const unsigned Opcode = getNewOpcode(CI);
1150 
1151   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1152 
1153   Register DestReg = MRI->createVirtualRegister(SuperRC);
1154   unsigned MergedDMask = CI.DMask0 | CI.DMask1;
1155   unsigned DMaskIdx =
1156       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1157 
1158   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1159   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1160     if (I == DMaskIdx)
1161       MIB.addImm(MergedDMask);
1162     else
1163       MIB.add((*CI.I).getOperand(I));
1164   }
1165 
1166   // It shouldn't be possible to get this far if the two instructions
1167   // don't have a single memoperand, because MachineInstr::mayAlias()
1168   // will return true if this is the case.
1169   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1170 
1171   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1172   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1173 
1174   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1175 
1176   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1177   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1178   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1179 
1180   // Copy to the old destination registers.
1181   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1182   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1183   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1184 
1185   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1186       .add(*Dest0) // Copy to same destination including flags and sub reg.
1187       .addReg(DestReg, 0, SubRegIdx0);
1188   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1189                             .add(*Dest1)
1190                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1191 
1192   moveInstsAfter(Copy1, CI.InstsToMove);
1193 
1194   CI.I->eraseFromParent();
1195   CI.Paired->eraseFromParent();
1196   return New;
1197 }
1198 
1199 MachineBasicBlock::iterator
1200 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
1201   MachineBasicBlock *MBB = CI.I->getParent();
1202   DebugLoc DL = CI.I->getDebugLoc();
1203   const unsigned Opcode = getNewOpcode(CI);
1204 
1205   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1206 
1207   Register DestReg = MRI->createVirtualRegister(SuperRC);
1208   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1209 
1210   // It shouldn't be possible to get this far if the two instructions
1211   // don't have a single memoperand, because MachineInstr::mayAlias()
1212   // will return true if this is the case.
1213   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1214 
1215   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1216   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1217 
1218   MachineInstr *New =
1219     BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
1220         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1221         .addImm(MergedOffset) // offset
1222         .addImm(CI.GLC0)      // glc
1223         .addImm(CI.DLC0)      // dlc
1224         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1225 
1226   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1227   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1228   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1229 
1230   // Copy to the old destination registers.
1231   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1232   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1233   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
1234 
1235   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1236       .add(*Dest0) // Copy to same destination including flags and sub reg.
1237       .addReg(DestReg, 0, SubRegIdx0);
1238   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1239                             .add(*Dest1)
1240                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1241 
1242   moveInstsAfter(Copy1, CI.InstsToMove);
1243 
1244   CI.I->eraseFromParent();
1245   CI.Paired->eraseFromParent();
1246   return New;
1247 }
1248 
1249 MachineBasicBlock::iterator
1250 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
1251   MachineBasicBlock *MBB = CI.I->getParent();
1252   DebugLoc DL = CI.I->getDebugLoc();
1253 
1254   const unsigned Opcode = getNewOpcode(CI);
1255 
1256   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1257 
1258   // Copy to the new source register.
1259   Register DestReg = MRI->createVirtualRegister(SuperRC);
1260   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1261 
1262   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1263 
1264   const unsigned Regs = getRegs(Opcode, *TII);
1265 
1266   if (Regs & VADDR)
1267     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1268 
1269   // It shouldn't be possible to get this far if the two instructions
1270   // don't have a single memoperand, because MachineInstr::mayAlias()
1271   // will return true if this is the case.
1272   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1273 
1274   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1275   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1276 
1277   MachineInstr *New =
1278     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1279         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1280         .addImm(MergedOffset) // offset
1281         .addImm(CI.GLC0)      // glc
1282         .addImm(CI.SLC0)      // slc
1283         .addImm(0)            // tfe
1284         .addImm(CI.DLC0)      // dlc
1285         .addImm(0)            // swz
1286         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1287 
1288   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1289   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1290   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1291 
1292   // Copy to the old destination registers.
1293   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1294   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1295   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1296 
1297   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1298       .add(*Dest0) // Copy to same destination including flags and sub reg.
1299       .addReg(DestReg, 0, SubRegIdx0);
1300   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1301                             .add(*Dest1)
1302                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1303 
1304   moveInstsAfter(Copy1, CI.InstsToMove);
1305 
1306   CI.I->eraseFromParent();
1307   CI.Paired->eraseFromParent();
1308   return New;
1309 }
1310 
1311 MachineBasicBlock::iterator
1312 SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI) {
1313   MachineBasicBlock *MBB = CI.I->getParent();
1314   DebugLoc DL = CI.I->getDebugLoc();
1315 
1316   const unsigned Opcode = getNewOpcode(CI);
1317 
1318   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1319 
1320   // Copy to the new source register.
1321   Register DestReg = MRI->createVirtualRegister(SuperRC);
1322   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1323 
1324   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1325 
1326   const unsigned Regs = getRegs(Opcode, *TII);
1327 
1328   if (Regs & VADDR)
1329     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1330 
1331   unsigned JoinedFormat =
1332       getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
1333 
1334   // It shouldn't be possible to get this far if the two instructions
1335   // don't have a single memoperand, because MachineInstr::mayAlias()
1336   // will return true if this is the case.
1337   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1338 
1339   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1340   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1341 
1342   MachineInstr *New =
1343       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1344           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1345           .addImm(MergedOffset) // offset
1346           .addImm(JoinedFormat) // format
1347           .addImm(CI.GLC0)      // glc
1348           .addImm(CI.SLC0)      // slc
1349           .addImm(0)            // tfe
1350           .addImm(CI.DLC0)      // dlc
1351           .addImm(0)            // swz
1352           .addMemOperand(
1353               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1354 
1355   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1356   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1357   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1358 
1359   // Copy to the old destination registers.
1360   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1361   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1362   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1363 
1364   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1365       .add(*Dest0) // Copy to same destination including flags and sub reg.
1366       .addReg(DestReg, 0, SubRegIdx0);
1367   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1368                             .add(*Dest1)
1369                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1370 
1371   moveInstsAfter(Copy1, CI.InstsToMove);
1372 
1373   CI.I->eraseFromParent();
1374   CI.Paired->eraseFromParent();
1375   return New;
1376 }
1377 
1378 MachineBasicBlock::iterator
1379 SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
1380   MachineBasicBlock *MBB = CI.I->getParent();
1381   DebugLoc DL = CI.I->getDebugLoc();
1382 
1383   const unsigned Opcode = getNewOpcode(CI);
1384 
1385   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1386   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1387   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1388 
1389   // Copy to the new source register.
1390   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1391   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1392 
1393   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1394   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1395 
1396   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1397       .add(*Src0)
1398       .addImm(SubRegIdx0)
1399       .add(*Src1)
1400       .addImm(SubRegIdx1);
1401 
1402   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1403                  .addReg(SrcReg, RegState::Kill);
1404 
1405   const unsigned Regs = getRegs(Opcode, *TII);
1406 
1407   if (Regs & VADDR)
1408     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1409 
1410   unsigned JoinedFormat =
1411       getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
1412 
1413   // It shouldn't be possible to get this far if the two instructions
1414   // don't have a single memoperand, because MachineInstr::mayAlias()
1415   // will return true if this is the case.
1416   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1417 
1418   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1419   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1420 
1421   MachineInstr *New =
1422       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1423           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1424           .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1425           .addImm(JoinedFormat)                     // format
1426           .addImm(CI.GLC0)                          // glc
1427           .addImm(CI.SLC0)                          // slc
1428           .addImm(0)                                // tfe
1429           .addImm(CI.DLC0)                          // dlc
1430           .addImm(0)                                // swz
1431           .addMemOperand(
1432               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1433 
1434   moveInstsAfter(MIB, CI.InstsToMove);
1435 
1436   CI.I->eraseFromParent();
1437   CI.Paired->eraseFromParent();
1438   return New;
1439 }
1440 
1441 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
1442   const unsigned Width = CI.Width0 + CI.Width1;
1443 
1444   switch (CI.InstClass) {
1445   default:
1446     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1447     // FIXME: Handle d16 correctly
1448     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1449                                   Width);
1450   case TBUFFER_LOAD:
1451   case TBUFFER_STORE:
1452     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1453                                   Width);
1454 
1455   case UNKNOWN:
1456     llvm_unreachable("Unknown instruction class");
1457   case S_BUFFER_LOAD_IMM:
1458     switch (Width) {
1459     default:
1460       return 0;
1461     case 2:
1462       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1463     case 4:
1464       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1465     }
1466   case MIMG:
1467     assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width));
1468     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1469   }
1470 }
1471 
1472 std::pair<unsigned, unsigned>
1473 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
1474 
1475   if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
1476     return std::make_pair(0, 0);
1477 
1478   bool ReverseOrder;
1479   if (CI.InstClass == MIMG) {
1480     assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
1481            "No overlaps");
1482     ReverseOrder = CI.DMask0 > CI.DMask1;
1483   } else
1484     ReverseOrder = CI.Offset0 > CI.Offset1;
1485 
1486   static const unsigned Idxs[4][4] = {
1487       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1488       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1489       {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1490       {AMDGPU::sub3, 0, 0, 0},
1491   };
1492   unsigned Idx0;
1493   unsigned Idx1;
1494 
1495   assert(CI.Width0 >= 1 && CI.Width0 <= 3);
1496   assert(CI.Width1 >= 1 && CI.Width1 <= 3);
1497 
1498   if (ReverseOrder) {
1499     Idx1 = Idxs[0][CI.Width1 - 1];
1500     Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
1501   } else {
1502     Idx0 = Idxs[0][CI.Width0 - 1];
1503     Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
1504   }
1505 
1506   return std::make_pair(Idx0, Idx1);
1507 }
1508 
1509 const TargetRegisterClass *
1510 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1511   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1512     switch (CI.Width0 + CI.Width1) {
1513     default:
1514       return nullptr;
1515     case 2:
1516       return &AMDGPU::SReg_64_XEXECRegClass;
1517     case 4:
1518       return &AMDGPU::SGPR_128RegClass;
1519     case 8:
1520       return &AMDGPU::SReg_256RegClass;
1521     case 16:
1522       return &AMDGPU::SReg_512RegClass;
1523     }
1524   } else {
1525     switch (CI.Width0 + CI.Width1) {
1526     default:
1527       return nullptr;
1528     case 2:
1529       return &AMDGPU::VReg_64RegClass;
1530     case 3:
1531       return &AMDGPU::VReg_96RegClass;
1532     case 4:
1533       return &AMDGPU::VReg_128RegClass;
1534     }
1535   }
1536 }
1537 
1538 MachineBasicBlock::iterator
1539 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1540   MachineBasicBlock *MBB = CI.I->getParent();
1541   DebugLoc DL = CI.I->getDebugLoc();
1542 
1543   const unsigned Opcode = getNewOpcode(CI);
1544 
1545   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1546   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1547   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1548 
1549   // Copy to the new source register.
1550   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1551   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1552 
1553   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1554   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1555 
1556   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1557       .add(*Src0)
1558       .addImm(SubRegIdx0)
1559       .add(*Src1)
1560       .addImm(SubRegIdx1);
1561 
1562   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1563                  .addReg(SrcReg, RegState::Kill);
1564 
1565   const unsigned Regs = getRegs(Opcode, *TII);
1566 
1567   if (Regs & VADDR)
1568     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1569 
1570 
1571   // It shouldn't be possible to get this far if the two instructions
1572   // don't have a single memoperand, because MachineInstr::mayAlias()
1573   // will return true if this is the case.
1574   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1575 
1576   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1577   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1578 
1579   MachineInstr *New =
1580     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1581         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1582         .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1583         .addImm(CI.GLC0)      // glc
1584         .addImm(CI.SLC0)      // slc
1585         .addImm(0)            // tfe
1586         .addImm(CI.DLC0)      // dlc
1587         .addImm(0)            // swz
1588         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1589 
1590   moveInstsAfter(MIB, CI.InstsToMove);
1591 
1592   CI.I->eraseFromParent();
1593   CI.Paired->eraseFromParent();
1594   return New;
1595 }
1596 
1597 MachineOperand
1598 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1599   APInt V(32, Val, true);
1600   if (TII->isInlineConstant(V))
1601     return MachineOperand::CreateImm(Val);
1602 
1603   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1604   MachineInstr *Mov =
1605   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1606           TII->get(AMDGPU::S_MOV_B32), Reg)
1607     .addImm(Val);
1608   (void)Mov;
1609   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1610   return MachineOperand::CreateReg(Reg, false);
1611 }
1612 
1613 // Compute base address using Addr and return the final register.
1614 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1615                                            const MemAddress &Addr) const {
1616   MachineBasicBlock *MBB = MI.getParent();
1617   MachineBasicBlock::iterator MBBI = MI.getIterator();
1618   DebugLoc DL = MI.getDebugLoc();
1619 
1620   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1621           Addr.Base.LoSubReg) &&
1622          "Expected 32-bit Base-Register-Low!!");
1623 
1624   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1625           Addr.Base.HiSubReg) &&
1626          "Expected 32-bit Base-Register-Hi!!");
1627 
1628   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1629   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1630   MachineOperand OffsetHi =
1631     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1632 
1633   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1634   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1635   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1636 
1637   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1638   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1639   MachineInstr *LoHalf =
1640     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1641       .addReg(CarryReg, RegState::Define)
1642       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1643       .add(OffsetLo)
1644       .addImm(0); // clamp bit
1645   (void)LoHalf;
1646   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1647 
1648   MachineInstr *HiHalf =
1649   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1650     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1651     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1652     .add(OffsetHi)
1653     .addReg(CarryReg, RegState::Kill)
1654     .addImm(0); // clamp bit
1655   (void)HiHalf;
1656   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1657 
1658   Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1659   MachineInstr *FullBase =
1660     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1661       .addReg(DestSub0)
1662       .addImm(AMDGPU::sub0)
1663       .addReg(DestSub1)
1664       .addImm(AMDGPU::sub1);
1665   (void)FullBase;
1666   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1667 
1668   return FullDestReg;
1669 }
1670 
1671 // Update base and offset with the NewBase and NewOffset in MI.
1672 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1673                                                unsigned NewBase,
1674                                                int32_t NewOffset) const {
1675   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1676   Base->setReg(NewBase);
1677   Base->setIsKill(false);
1678   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1679 }
1680 
1681 Optional<int32_t>
1682 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1683   if (Op.isImm())
1684     return Op.getImm();
1685 
1686   if (!Op.isReg())
1687     return None;
1688 
1689   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1690   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1691       !Def->getOperand(1).isImm())
1692     return None;
1693 
1694   return Def->getOperand(1).getImm();
1695 }
1696 
1697 // Analyze Base and extracts:
1698 //  - 32bit base registers, subregisters
1699 //  - 64bit constant offset
1700 // Expecting base computation as:
1701 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1702 //   %LO:vgpr_32, %c:sreg_64_xexec =
1703 //       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1704 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1705 //   %Base:vreg_64 =
1706 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1707 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1708                                                       MemAddress &Addr) const {
1709   if (!Base.isReg())
1710     return;
1711 
1712   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1713   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1714       || Def->getNumOperands() != 5)
1715     return;
1716 
1717   MachineOperand BaseLo = Def->getOperand(1);
1718   MachineOperand BaseHi = Def->getOperand(3);
1719   if (!BaseLo.isReg() || !BaseHi.isReg())
1720     return;
1721 
1722   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1723   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1724 
1725   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1726       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1727     return;
1728 
1729   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1730   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1731 
1732   auto Offset0P = extractConstOffset(*Src0);
1733   if (Offset0P)
1734     BaseLo = *Src1;
1735   else {
1736     if (!(Offset0P = extractConstOffset(*Src1)))
1737       return;
1738     BaseLo = *Src0;
1739   }
1740 
1741   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1742   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1743 
1744   if (Src0->isImm())
1745     std::swap(Src0, Src1);
1746 
1747   if (!Src1->isImm())
1748     return;
1749 
1750   uint64_t Offset1 = Src1->getImm();
1751   BaseHi = *Src0;
1752 
1753   Addr.Base.LoReg = BaseLo.getReg();
1754   Addr.Base.HiReg = BaseHi.getReg();
1755   Addr.Base.LoSubReg = BaseLo.getSubReg();
1756   Addr.Base.HiSubReg = BaseHi.getSubReg();
1757   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1758 }
1759 
1760 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1761     MachineInstr &MI,
1762     MemInfoMap &Visited,
1763     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1764 
1765   if (!(MI.mayLoad() ^ MI.mayStore()))
1766     return false;
1767 
1768   // TODO: Support flat and scratch.
1769   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1770     return false;
1771 
1772   if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1773     return false;
1774 
1775   if (AnchorList.count(&MI))
1776     return false;
1777 
1778   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1779 
1780   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1781     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1782     return false;
1783   }
1784 
1785   // Step1: Find the base-registers and a 64bit constant offset.
1786   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1787   MemAddress MAddr;
1788   if (Visited.find(&MI) == Visited.end()) {
1789     processBaseWithConstOffset(Base, MAddr);
1790     Visited[&MI] = MAddr;
1791   } else
1792     MAddr = Visited[&MI];
1793 
1794   if (MAddr.Offset == 0) {
1795     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1796                          " constant offsets that can be promoted.\n";);
1797     return false;
1798   }
1799 
1800   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1801              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1802 
1803   // Step2: Traverse through MI's basic block and find an anchor(that has the
1804   // same base-registers) with the highest 13bit distance from MI's offset.
1805   // E.g. (64bit loads)
1806   // bb:
1807   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1808   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1809   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1810   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1811   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1812   //
1813   // Starting from the first load, the optimization will try to find a new base
1814   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1815   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
1816   // as the new-base(anchor) because of the maximum distance which can
1817   // accomodate more intermediate bases presumeably.
1818   //
1819   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1820   // (&a + 8192) for load1, load2, load4.
1821   //   addr = &a + 8192
1822   //   load1 = load(addr,       -4096)
1823   //   load2 = load(addr,       -2048)
1824   //   load3 = load(addr,       0)
1825   //   load4 = load(addr,       2048)
1826   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1827   //
1828   MachineInstr *AnchorInst = nullptr;
1829   MemAddress AnchorAddr;
1830   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1831   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1832 
1833   MachineBasicBlock *MBB = MI.getParent();
1834   MachineBasicBlock::iterator E = MBB->end();
1835   MachineBasicBlock::iterator MBBI = MI.getIterator();
1836   ++MBBI;
1837   const SITargetLowering *TLI =
1838     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1839 
1840   for ( ; MBBI != E; ++MBBI) {
1841     MachineInstr &MINext = *MBBI;
1842     // TODO: Support finding an anchor(with same base) from store addresses or
1843     // any other load addresses where the opcodes are different.
1844     if (MINext.getOpcode() != MI.getOpcode() ||
1845         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1846       continue;
1847 
1848     const MachineOperand &BaseNext =
1849       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1850     MemAddress MAddrNext;
1851     if (Visited.find(&MINext) == Visited.end()) {
1852       processBaseWithConstOffset(BaseNext, MAddrNext);
1853       Visited[&MINext] = MAddrNext;
1854     } else
1855       MAddrNext = Visited[&MINext];
1856 
1857     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1858         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1859         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1860         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1861       continue;
1862 
1863     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1864 
1865     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1866     TargetLoweringBase::AddrMode AM;
1867     AM.HasBaseReg = true;
1868     AM.BaseOffs = Dist;
1869     if (TLI->isLegalGlobalAddressingMode(AM) &&
1870         (uint32_t)std::abs(Dist) > MaxDist) {
1871       MaxDist = std::abs(Dist);
1872 
1873       AnchorAddr = MAddrNext;
1874       AnchorInst = &MINext;
1875     }
1876   }
1877 
1878   if (AnchorInst) {
1879     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1880                AnchorInst->dump());
1881     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1882                <<  AnchorAddr.Offset << "\n\n");
1883 
1884     // Instead of moving up, just re-compute anchor-instruction's base address.
1885     unsigned Base = computeBase(MI, AnchorAddr);
1886 
1887     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1888     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1889 
1890     for (auto P : InstsWCommonBase) {
1891       TargetLoweringBase::AddrMode AM;
1892       AM.HasBaseReg = true;
1893       AM.BaseOffs = P.second - AnchorAddr.Offset;
1894 
1895       if (TLI->isLegalGlobalAddressingMode(AM)) {
1896         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1897                    dbgs() << ")"; P.first->dump());
1898         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1899         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1900       }
1901     }
1902     AnchorList.insert(AnchorInst);
1903     return true;
1904   }
1905 
1906   return false;
1907 }
1908 
1909 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1910                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1911   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1912     if (AddrList.front().InstClass == CI.InstClass &&
1913         AddrList.front().hasSameBaseAddress(*CI.I)) {
1914       AddrList.emplace_back(CI);
1915       return;
1916     }
1917   }
1918 
1919   // Base address not found, so add a new list.
1920   MergeableInsts.emplace_back(1, CI);
1921 }
1922 
1923 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
1924                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1925   bool Modified = false;
1926   // Contain the list
1927   MemInfoMap Visited;
1928   // Contains the list of instructions for which constant offsets are being
1929   // promoted to the IMM.
1930   SmallPtrSet<MachineInstr *, 4> AnchorList;
1931 
1932   // Sort potential mergeable instructions into lists.  One list per base address.
1933   for (MachineInstr &MI : MBB.instrs()) {
1934     // We run this before checking if an address is mergeable, because it can produce
1935     // better code even if the instructions aren't mergeable.
1936     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1937       Modified = true;
1938 
1939     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1940     if (InstClass == UNKNOWN)
1941       continue;
1942 
1943     // Don't combine if volatile.
1944     if (MI.hasOrderedMemoryRef())
1945       continue;
1946 
1947     CombineInfo CI;
1948     CI.setMI(MI, *TII, *STM);
1949 
1950     if (!CI.hasMergeableAddress(*MRI))
1951       continue;
1952 
1953     addInstToMergeableList(CI, MergeableInsts);
1954   }
1955   return Modified;
1956 }
1957 
1958 // Scan through looking for adjacent LDS operations with constant offsets from
1959 // the same base register. We rely on the scheduler to do the hard work of
1960 // clustering nearby loads, and assume these are all adjacent.
1961 bool SILoadStoreOptimizer::optimizeBlock(
1962                        std::list<std::list<CombineInfo> > &MergeableInsts) {
1963   bool Modified = false;
1964 
1965   for (std::list<CombineInfo> &MergeList : MergeableInsts) {
1966     if (MergeList.size() < 2)
1967       continue;
1968 
1969     bool OptimizeListAgain = false;
1970     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1971       // We weren't able to make any changes, so clear the list so we don't
1972       // process the same instructions the next time we try to optimize this
1973       // block.
1974       MergeList.clear();
1975       continue;
1976     }
1977 
1978     // We made changes, but also determined that there were no more optimization
1979     // opportunities, so we don't need to reprocess the list
1980     if (!OptimizeListAgain)
1981       MergeList.clear();
1982 
1983     OptimizeAgain |= OptimizeListAgain;
1984     Modified = true;
1985   }
1986   return Modified;
1987 }
1988 
1989 void
1990 SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
1991                                          const MachineInstr &MI) {
1992 
1993   for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
1994     if (&*CI->I == &MI) {
1995       MergeList.erase(CI);
1996       return;
1997     }
1998   }
1999 }
2000 
2001 bool
2002 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2003                                           std::list<CombineInfo> &MergeList,
2004                                           bool &OptimizeListAgain) {
2005   bool Modified = false;
2006   for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
2007     CombineInfo &CI = *I;
2008 
2009     if (CI.InstClass == UNKNOWN)
2010       continue;
2011 
2012     if (!findMatchingInst(CI))
2013       goto done;
2014 
2015     Modified = true;
2016     removeCombinedInst(MergeList, *CI.Paired);
2017 
2018     switch (CI.InstClass) {
2019     default:
2020       llvm_unreachable("unknown InstClass");
2021       break;
2022     case DS_READ: {
2023       MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
2024       CI.setMI(NewMI, *TII, *STM);
2025       break;
2026     }
2027     case DS_WRITE: {
2028       MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
2029       CI.setMI(NewMI, *TII, *STM);
2030       break;
2031     }
2032     case S_BUFFER_LOAD_IMM: {
2033       MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
2034       CI.setMI(NewMI, *TII, *STM);
2035       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
2036       break;
2037     }
2038     case BUFFER_LOAD: {
2039       MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
2040       CI.setMI(NewMI, *TII, *STM);
2041       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
2042       break;
2043     }
2044     case BUFFER_STORE: {
2045       MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
2046       CI.setMI(NewMI, *TII, *STM);
2047       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
2048       break;
2049     }
2050     case MIMG: {
2051       MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
2052       CI.setMI(NewMI, *TII, *STM);
2053       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
2054       break;
2055     }
2056     case TBUFFER_LOAD: {
2057       MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI);
2058       CI.setMI(NewMI, *TII, *STM);
2059       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
2060       break;
2061     }
2062     case TBUFFER_STORE: {
2063       MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI);
2064       CI.setMI(NewMI, *TII, *STM);
2065       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
2066       break;
2067     }
2068     }
2069 
2070 done:
2071     // Clear the InstsToMove after we have finished searching so we don't have
2072     // stale values left over if we search for this CI again in another pass
2073     // over the block.
2074     CI.InstsToMove.clear();
2075   }
2076 
2077   return Modified;
2078 }
2079 
2080 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2081   if (skipFunction(MF.getFunction()))
2082     return false;
2083 
2084   STM = &MF.getSubtarget<GCNSubtarget>();
2085   if (!STM->loadStoreOptEnabled())
2086     return false;
2087 
2088   TII = STM->getInstrInfo();
2089   TRI = &TII->getRegisterInfo();
2090   STI = &MF.getSubtarget<MCSubtargetInfo>();
2091 
2092   MRI = &MF.getRegInfo();
2093   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2094 
2095   assert(MRI->isSSA() && "Must be run on SSA");
2096 
2097   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2098 
2099   bool Modified = false;
2100 
2101 
2102   for (MachineBasicBlock &MBB : MF) {
2103     std::list<std::list<CombineInfo> > MergeableInsts;
2104     // First pass: Collect list of all instructions we know how to merge.
2105     Modified |= collectMergeableInsts(MBB, MergeableInsts);
2106     do {
2107       OptimizeAgain = false;
2108       Modified |= optimizeBlock(MergeableInsts);
2109     } while (OptimizeAgain);
2110   }
2111 
2112   return Modified;
2113 }
2114