1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to reuse a base from a nearby instruction such
// that the access needs only a 13-bit constant offset, which is then promoted
// to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that their deltas fit in 8
//   bits, we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82   GLOBAL_LOAD
83 };
84 
85 struct AddressRegs {
86   unsigned char NumVAddrs = 0;
87   bool SBase = false;
88   bool SRsrc = false;
89   bool SOffset = false;
90   bool VAddr = false;
91   bool Addr = false;
92   bool SSamp = false;
93 };
94 
95 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
96 const unsigned MaxAddressRegs = 12 + 1 + 1;
97 
98 class SILoadStoreOptimizer : public MachineFunctionPass {
99   struct CombineInfo {
100     MachineBasicBlock::iterator I;
101     unsigned EltSize;
102     unsigned Offset;
103     unsigned Width;
104     unsigned Format;
105     unsigned BaseOff;
106     unsigned DMask;
107     InstClassEnum InstClass;
108     unsigned CPol = 0;
109     bool IsAGPR;
110     bool UseST64;
111     int AddrIdx[MaxAddressRegs];
112     const MachineOperand *AddrReg[MaxAddressRegs];
113     unsigned NumAddresses;
114     unsigned Order;
115 
116     bool hasSameBaseAddress(const MachineInstr &MI) {
117       for (unsigned i = 0; i < NumAddresses; i++) {
118         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
119 
120         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
121           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
122               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
123             return false;
124           }
125           continue;
126         }
127 
128         // Check same base pointer. Be careful of subregisters, which can occur
129         // with vectors of pointers.
130         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
131             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
133         }
134       }
135       return true;
136     }
137 
138     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
139       for (unsigned i = 0; i < NumAddresses; ++i) {
140         const MachineOperand *AddrOp = AddrReg[i];
141         // Immediates are always OK.
142         if (AddrOp->isImm())
143           continue;
144 
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
148         if (!AddrOp->isReg())
149           return false;
150 
151         // TODO: We should be able to merge physical reg addresses.
152         if (AddrOp->getReg().isPhysical())
153           return false;
154 
        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
157         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
158           return false;
159       }
160       return true;
161     }
162 
163     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
164   };
165 
166   struct BaseRegisters {
167     Register LoReg;
168     Register HiReg;
169 
170     unsigned LoSubReg = 0;
171     unsigned HiSubReg = 0;
172   };
173 
174   struct MemAddress {
175     BaseRegisters Base;
176     int64_t Offset = 0;
177   };
178 
179   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
180 
181 private:
182   const GCNSubtarget *STM = nullptr;
183   const SIInstrInfo *TII = nullptr;
184   const SIRegisterInfo *TRI = nullptr;
185   MachineRegisterInfo *MRI = nullptr;
186   AliasAnalysis *AA = nullptr;
187   bool OptimizeAgain;
188 
189   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
190                            const DenseSet<Register> &ARegUses,
191                            const MachineInstr &A, const MachineInstr &B) const;
192   static bool dmasksCanBeCombined(const CombineInfo &CI,
193                                   const SIInstrInfo &TII,
194                                   const CombineInfo &Paired);
195   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
196                                    CombineInfo &Paired, bool Modify = false);
197   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
198                         const CombineInfo &Paired);
199   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
200   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
201                                                      const CombineInfo &Paired);
202   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
203                                                     const CombineInfo &Paired);
204   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
205 
206   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
207 
208   unsigned read2Opcode(unsigned EltSize) const;
209   unsigned read2ST64Opcode(unsigned EltSize) const;
210   MachineBasicBlock::iterator
211   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
212                  MachineBasicBlock::iterator InsertBefore);
213 
214   unsigned write2Opcode(unsigned EltSize) const;
215   unsigned write2ST64Opcode(unsigned EltSize) const;
216   MachineBasicBlock::iterator
217   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
218                   MachineBasicBlock::iterator InsertBefore);
219   MachineBasicBlock::iterator
220   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
221                  MachineBasicBlock::iterator InsertBefore);
222   MachineBasicBlock::iterator
223   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
224                           MachineBasicBlock::iterator InsertBefore);
225   MachineBasicBlock::iterator
226   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
227                       MachineBasicBlock::iterator InsertBefore);
228   MachineBasicBlock::iterator
229   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
230                        MachineBasicBlock::iterator InsertBefore);
231   MachineBasicBlock::iterator
232   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
233                        MachineBasicBlock::iterator InsertBefore);
234   MachineBasicBlock::iterator
235   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
236                         MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
239                       MachineBasicBlock::iterator InsertBefore);
240 
241   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
242                            int32_t NewOffset) const;
243   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
244   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
245   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
246   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from a nearby instruction that allows it to have a
  /// 13-bit constant offset, which is then promoted to the immediate.
250   bool promoteConstantOffsetToImm(MachineInstr &CI,
251                                   MemInfoMap &Visited,
252                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
253   void addInstToMergeableList(const CombineInfo &CI,
254                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
255 
256   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
257       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
258       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
259       std::list<std::list<CombineInfo>> &MergeableInsts) const;
260 
261 public:
262   static char ID;
263 
264   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
265     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
266   }
267 
268   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
269                                      bool &OptimizeListAgain);
270   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
271 
272   bool runOnMachineFunction(MachineFunction &MF) override;
273 
274   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
275 
276   void getAnalysisUsage(AnalysisUsage &AU) const override {
277     AU.setPreservesCFG();
278     AU.addRequired<AAResultsWrapperPass>();
279 
280     MachineFunctionPass::getAnalysisUsage(AU);
281   }
282 
283   MachineFunctionProperties getRequiredProperties() const override {
284     return MachineFunctionProperties()
285       .set(MachineFunctionProperties::Property::IsSSA);
286   }
287 };
288 
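/// Return the number of dwords read or written by \p MI (for MIMG, the
/// number of enabled dmask channels), or 0 if the opcode is not handled by
/// this pass.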
289 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
290   const unsigned Opc = MI.getOpcode();
291 
292   if (TII.isMUBUF(Opc)) {
293     // FIXME: Handle d16 correctly
294     return AMDGPU::getMUBUFElements(Opc);
295   }
296   if (TII.isMIMG(MI)) {
297     uint64_t DMaskImm =
298         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
299     return countPopulation(DMaskImm);
300   }
301   if (TII.isMTBUF(Opc)) {
302     return AMDGPU::getMTBUFElements(Opc);
303   }
304 
305   switch (Opc) {
306   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
307   case AMDGPU::GLOBAL_LOAD_DWORD:
308     return 1;
309   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
310   case AMDGPU::GLOBAL_LOAD_DWORDX2:
311     return 2;
312   case AMDGPU::GLOBAL_LOAD_DWORDX3:
313     return 3;
314   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
315   case AMDGPU::GLOBAL_LOAD_DWORDX4:
316     return 4;
317   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
318     return 8;
319   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
320   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
321   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
322   case AMDGPU::DS_WRITE_B32_gfx9:
323     return 1;
324   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
325   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
326   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
327   case AMDGPU::DS_WRITE_B64_gfx9:
328     return 2;
329   default:
330     return 0;
331   }
332 }
333 
334 /// Maps instruction opcode to enum InstClassEnum.
335 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
336   switch (Opc) {
337   default:
338     if (TII.isMUBUF(Opc)) {
339       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
340       default:
341         return UNKNOWN;
342       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
343       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
344       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
345       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
346         return BUFFER_LOAD;
347       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
348       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
349       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
350       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
351         return BUFFER_STORE;
352       }
353     }
354     if (TII.isMIMG(Opc)) {
355       // Ignore instructions encoded without vaddr.
356       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
357           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
358         return UNKNOWN;
359       // Ignore BVH instructions
360       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
361         return UNKNOWN;
362       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
363       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
364           TII.isGather4(Opc))
365         return UNKNOWN;
366       return MIMG;
367     }
368     if (TII.isMTBUF(Opc)) {
369       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
370       default:
371         return UNKNOWN;
372       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
373       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
374       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
375       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
376         return TBUFFER_LOAD;
377       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
378       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
379       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
380       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
381         return TBUFFER_STORE;
382       }
383     }
384     return UNKNOWN;
385   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
386   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
387   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
388   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
389     return S_BUFFER_LOAD_IMM;
390   case AMDGPU::DS_READ_B32:
391   case AMDGPU::DS_READ_B32_gfx9:
392   case AMDGPU::DS_READ_B64:
393   case AMDGPU::DS_READ_B64_gfx9:
394     return DS_READ;
395   case AMDGPU::DS_WRITE_B32:
396   case AMDGPU::DS_WRITE_B32_gfx9:
397   case AMDGPU::DS_WRITE_B64:
398   case AMDGPU::DS_WRITE_B64_gfx9:
399     return DS_WRITE;
400   case AMDGPU::GLOBAL_LOAD_DWORD:
401   case AMDGPU::GLOBAL_LOAD_DWORDX2:
402   case AMDGPU::GLOBAL_LOAD_DWORDX3:
403   case AMDGPU::GLOBAL_LOAD_DWORDX4:
404     return GLOBAL_LOAD;
405   }
406 }
407 
408 /// Determines instruction subclass from opcode. Only instructions
409 /// of the same subclass can be merged together. The merged instruction may have
410 /// a different subclass but must have the same class.
411 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
412   switch (Opc) {
413   default:
414     if (TII.isMUBUF(Opc))
415       return AMDGPU::getMUBUFBaseOpcode(Opc);
416     if (TII.isMIMG(Opc)) {
417       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
418       assert(Info);
419       return Info->BaseOpcode;
420     }
421     if (TII.isMTBUF(Opc))
422       return AMDGPU::getMTBUFBaseOpcode(Opc);
423     return -1;
424   case AMDGPU::DS_READ_B32:
425   case AMDGPU::DS_READ_B32_gfx9:
426   case AMDGPU::DS_READ_B64:
427   case AMDGPU::DS_READ_B64_gfx9:
428   case AMDGPU::DS_WRITE_B32:
429   case AMDGPU::DS_WRITE_B32_gfx9:
430   case AMDGPU::DS_WRITE_B64:
431   case AMDGPU::DS_WRITE_B64_gfx9:
432     return Opc;
433   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
434   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
435   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
436   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
437     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
438   case AMDGPU::GLOBAL_LOAD_DWORD:
439   case AMDGPU::GLOBAL_LOAD_DWORDX2:
440   case AMDGPU::GLOBAL_LOAD_DWORDX3:
441   case AMDGPU::GLOBAL_LOAD_DWORDX4:
442     return AMDGPU::GLOBAL_LOAD_DWORD;
443   }
444 }
445 
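/// Determine which address operands (vaddr, addr, sbase, srsrc, soffset,
/// ssamp) are present for opcode \p Opc. CombineInfo::setMI uses the result
/// to record the operand indices of all address components.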
446 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
447   AddressRegs Result;
448 
449   if (TII.isMUBUF(Opc)) {
450     if (AMDGPU::getMUBUFHasVAddr(Opc))
451       Result.VAddr = true;
452     if (AMDGPU::getMUBUFHasSrsrc(Opc))
453       Result.SRsrc = true;
454     if (AMDGPU::getMUBUFHasSoffset(Opc))
455       Result.SOffset = true;
456 
457     return Result;
458   }
459 
460   if (TII.isMIMG(Opc)) {
461     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
462     if (VAddr0Idx >= 0) {
463       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
464       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
465     } else {
466       Result.VAddr = true;
467     }
468     Result.SRsrc = true;
469     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
470     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
471       Result.SSamp = true;
472 
473     return Result;
474   }
475   if (TII.isMTBUF(Opc)) {
476     if (AMDGPU::getMTBUFHasVAddr(Opc))
477       Result.VAddr = true;
478     if (AMDGPU::getMTBUFHasSrsrc(Opc))
479       Result.SRsrc = true;
480     if (AMDGPU::getMTBUFHasSoffset(Opc))
481       Result.SOffset = true;
482 
483     return Result;
484   }
485 
486   switch (Opc) {
487   default:
488     return Result;
489   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
490   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
491   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
492   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
493     Result.SBase = true;
494     return Result;
495   case AMDGPU::DS_READ_B32:
496   case AMDGPU::DS_READ_B64:
497   case AMDGPU::DS_READ_B32_gfx9:
498   case AMDGPU::DS_READ_B64_gfx9:
499   case AMDGPU::DS_WRITE_B32:
500   case AMDGPU::DS_WRITE_B64:
501   case AMDGPU::DS_WRITE_B32_gfx9:
502   case AMDGPU::DS_WRITE_B64_gfx9:
503     Result.Addr = true;
504     return Result;
505   case AMDGPU::GLOBAL_LOAD_DWORD:
506   case AMDGPU::GLOBAL_LOAD_DWORDX2:
507   case AMDGPU::GLOBAL_LOAD_DWORDX3:
508   case AMDGPU::GLOBAL_LOAD_DWORDX4:
509     Result.VAddr = true;
510     return Result;
511   }
512 }
513 
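/// Initialize this CombineInfo from \p MI: classify the instruction and
/// record its element size, offset, width, dmask/format, cache policy and
/// address operands.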
514 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
515                                               const SILoadStoreOptimizer &LSO) {
516   I = MI;
517   unsigned Opc = MI->getOpcode();
518   InstClass = getInstClass(Opc, *LSO.TII);
519 
520   if (InstClass == UNKNOWN)
521     return;
522 
523   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
524 
525   switch (InstClass) {
526   case DS_READ:
527    EltSize =
528           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
529                                                                           : 4;
530    break;
531   case DS_WRITE:
532     EltSize =
533           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
534                                                                             : 4;
535     break;
536   case S_BUFFER_LOAD_IMM:
537     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
538     break;
539   default:
540     EltSize = 4;
541     break;
542   }
543 
544   if (InstClass == MIMG) {
545     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
546     // Offset is not considered for MIMG instructions.
547     Offset = 0;
548   } else {
549     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
550     Offset = I->getOperand(OffsetIdx).getImm();
551   }
552 
553   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
554     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
555 
556   Width = getOpcodeWidth(*I, *LSO.TII);
557 
558   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
559     Offset &= 0xffff;
560   } else if (InstClass != MIMG) {
561     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
562   }
563 
564   AddressRegs Regs = getRegs(Opc, *LSO.TII);
565 
566   NumAddresses = 0;
567   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
568     AddrIdx[NumAddresses++] =
569         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
570   if (Regs.Addr)
571     AddrIdx[NumAddresses++] =
572         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
573   if (Regs.SBase)
574     AddrIdx[NumAddresses++] =
575         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
576   if (Regs.SRsrc)
577     AddrIdx[NumAddresses++] =
578         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
579   if (Regs.SOffset)
580     AddrIdx[NumAddresses++] =
581         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
582   if (Regs.VAddr)
583     AddrIdx[NumAddresses++] =
584         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
585   if (Regs.SSamp)
586     AddrIdx[NumAddresses++] =
587         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
588   assert(NumAddresses <= MaxAddressRegs);
589 
590   for (unsigned J = 0; J < NumAddresses; J++)
591     AddrReg[J] = &I->getOperand(AddrIdx[J]);
592 }
593 
594 } // end anonymous namespace.
595 
596 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
597                       "SI Load Store Optimizer", false, false)
598 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
599 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
600                     false, false)
601 
602 char SILoadStoreOptimizer::ID = 0;
603 
604 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
605 
606 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
607   return new SILoadStoreOptimizer();
608 }
609 
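// Collect the registers defined and read by \p MI into \p RegDefs and
// \p RegUses, for the dependence checks done in canSwapInstructions().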
610 static void addDefsUsesToList(const MachineInstr &MI,
611                               DenseSet<Register> &RegDefs,
612                               DenseSet<Register> &RegUses) {
613   for (const auto &Op : MI.operands()) {
614     if (!Op.isReg())
615       continue;
616     if (Op.isDef())
617       RegDefs.insert(Op.getReg());
618     if (Op.readsReg())
619       RegUses.insert(Op.getReg());
620   }
621 }
622 
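// Return true if instruction \p B can be moved across instruction \p A: the
// two must not have an aliasing store conflict, and \p B must not touch any
// register that \p A defines, nor define any register that \p A reads.
// \p ARegDefs and \p ARegUses are the precomputed def/use sets of \p A.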
623 bool SILoadStoreOptimizer::canSwapInstructions(
624     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
625     const MachineInstr &A, const MachineInstr &B) const {
626   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
627       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
628     return false;
629   for (const auto &BOp : B.operands()) {
630     if (!BOp.isReg())
631       continue;
632     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
633       return false;
634     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
635       return false;
636   }
637   return true;
638 }
639 
// This function assumes that \p A and \p B are identical except for size and
// offset, and that they reference adjacent memory.
642 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
643                                                    const MachineMemOperand *A,
644                                                    const MachineMemOperand *B) {
645   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
646   unsigned Size = A->getSize() + B->getSize();
647   // This function adds the offset parameter to the existing offset for A,
648   // so we pass 0 here as the offset and then manually set it to the correct
649   // value after the call.
650   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
651   MMO->setOffset(MinOffset);
652   return MMO;
653 }
654 
655 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
656                                                const SIInstrInfo &TII,
657                                                const CombineInfo &Paired) {
658   assert(CI.InstClass == MIMG);
659 
660   // Ignore instructions with tfe/lwe set.
661   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
662   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
663 
664   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
665     return false;
666 
667   // Check other optional immediate operands for equality.
668   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
669                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
670                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
671 
672   for (auto op : OperandsToMatch) {
673     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
674     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
675       return false;
676     if (Idx != -1 &&
677         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
678       return false;
679   }
680 
681   // Check DMask for overlaps.
682   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
683   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
684 
685   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
686   if ((1u << AllowedBitsForMin) <= MinMask)
687     return false;
688 
689   return true;
690 }
691 
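// Look up the buffer format that matches \p OldFormat in bits-per-component
// and numeric format but has \p ComponentCount components. Returns 0 if no
// such format exists.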
692 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
693                                        unsigned ComponentCount,
694                                        const GCNSubtarget &STI) {
695   if (ComponentCount > 4)
696     return 0;
697 
698   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
699       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
700   if (!OldFormatInfo)
701     return 0;
702 
703   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
704       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
705                                            ComponentCount,
706                                            OldFormatInfo->NumFormat, STI);
707 
708   if (!NewFormatInfo)
709     return 0;
710 
711   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
712          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
713 
714   return NewFormatInfo->Format;
715 }
716 
717 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
718 // highest power of two. Note that the result is well defined for all inputs
719 // including corner cases like:
720 // - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
722 // - if Lo > Hi, return 0 (as if the range wrapped around)
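// For example, mostAlignedValueInRange(3, 10) returns 8.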
723 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
724   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
725 }
726 
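// Check whether the offsets of CI and Paired can be represented by a single
// merged instruction: adjacent dwords with matching cache policy for the
// buffer/global/scalar classes, or any pair encodable as read2/write2
// offsets for DS. With \p Modify set, also rewrite CI and Paired with the
// offsets, BaseOff and UseST64 values the merged instruction will use.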
727 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
728                                                 const GCNSubtarget &STI,
729                                                 CombineInfo &Paired,
730                                                 bool Modify) {
731   assert(CI.InstClass != MIMG);
732 
733   // XXX - Would the same offset be OK? Is there any reason this would happen or
734   // be useful?
735   if (CI.Offset == Paired.Offset)
736     return false;
737 
738   // This won't be valid if the offset isn't aligned.
739   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
740     return false;
741 
742   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
743 
744     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
745         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
746     if (!Info0)
747       return false;
748     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
749         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
750     if (!Info1)
751       return false;
752 
753     if (Info0->BitsPerComp != Info1->BitsPerComp ||
754         Info0->NumFormat != Info1->NumFormat)
755       return false;
756 
757     // TODO: Should be possible to support more formats, but if format loads
758     // are not dword-aligned, the merged load might not be valid.
759     if (Info0->BitsPerComp != 32)
760       return false;
761 
762     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
763       return false;
764   }
765 
766   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
767   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
768   CI.UseST64 = false;
769   CI.BaseOff = 0;
770 
771   // Handle all non-DS instructions.
772   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
773     return (EltOffset0 + CI.Width == EltOffset1 ||
774             EltOffset1 + Paired.Width == EltOffset0) &&
775            CI.CPol == Paired.CPol;
776   }
777 
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
779   // the stride 64 versions.
780   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
781       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
782     if (Modify) {
783       CI.Offset = EltOffset0 / 64;
784       Paired.Offset = EltOffset1 / 64;
785       CI.UseST64 = true;
786     }
787     return true;
788   }
789 
790   // Check if the new offsets fit in the reduced 8-bit range.
791   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
792     if (Modify) {
793       CI.Offset = EltOffset0;
794       Paired.Offset = EltOffset1;
795     }
796     return true;
797   }
798 
799   // Try to shift base address to decrease offsets.
800   uint32_t Min = std::min(EltOffset0, EltOffset1);
801   uint32_t Max = std::max(EltOffset0, EltOffset1);
802 
803   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
804   if (((Max - Min) & ~Mask) == 0) {
805     if (Modify) {
806       // From the range of values we could use for BaseOff, choose the one that
807       // is aligned to the highest power of two, to maximise the chance that
808       // the same offset can be reused for other load/store pairs.
809       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
810       // Copy the low bits of the offsets, so that when we adjust them by
811       // subtracting BaseOff they will be multiples of 64.
812       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
813       CI.BaseOff = BaseOff * CI.EltSize;
814       CI.Offset = (EltOffset0 - BaseOff) / 64;
815       Paired.Offset = (EltOffset1 - BaseOff) / 64;
816       CI.UseST64 = true;
817     }
818     return true;
819   }
820 
821   if (isUInt<8>(Max - Min)) {
822     if (Modify) {
823       // From the range of values we could use for BaseOff, choose the one that
824       // is aligned to the highest power of two, to maximise the chance that
825       // the same offset can be reused for other load/store pairs.
826       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
827       CI.BaseOff = BaseOff * CI.EltSize;
828       CI.Offset = EltOffset0 - BaseOff;
829       Paired.Offset = EltOffset1 - BaseOff;
830     }
831     return true;
832   }
833 
834   return false;
835 }
836 
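// Check that the combined width is encodable: S_BUFFER loads support x2, x4
// and x8 variants; everything else supports up to four dwords (three only
// if the target has dwordx3 loads/stores).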
837 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
838                                      const CombineInfo &CI,
839                                      const CombineInfo &Paired) {
840   const unsigned Width = (CI.Width + Paired.Width);
841   switch (CI.InstClass) {
842   default:
843     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
844   case S_BUFFER_LOAD_IMM:
845     switch (Width) {
846     default:
847       return false;
848     case 2:
849     case 4:
850     case 8:
851       return true;
852     }
853   }
854 }
855 
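/// Return the register class of \p MI's data operand (vdst, vdata, data0,
/// sdst or sdata), or nullptr if there is none.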
856 const TargetRegisterClass *
857 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
858   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
859     return TRI->getRegClassForReg(*MRI, Dst->getReg());
860   }
861   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
862     return TRI->getRegClassForReg(*MRI, Src->getReg());
863   }
864   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
865     return TRI->getRegClassForReg(*MRI, Src->getReg());
866   }
867   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
868     return TRI->getRegClassForReg(*MRI, Dst->getReg());
869   }
870   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
871     return TRI->getRegClassForReg(*MRI, Src->getReg());
872   }
873   return nullptr;
874 }
875 
876 /// This function assumes that CI comes before Paired in a basic block. Return
877 /// an insertion point for the merged instruction or nullptr on failure.
878 SILoadStoreOptimizer::CombineInfo *
879 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
880                                            CombineInfo &Paired) {
881   // If another instruction has already been merged into CI, it may now be a
882   // type that we can't do any further merging into.
883   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
884     return nullptr;
885   assert(CI.InstClass == Paired.InstClass);
886 
887   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
888       getInstSubclass(Paired.I->getOpcode(), *TII))
889     return nullptr;
890 
891   // Check both offsets (or masks for MIMG) can be combined and fit in the
892   // reduced range.
893   if (CI.InstClass == MIMG) {
894     if (!dmasksCanBeCombined(CI, *TII, Paired))
895       return nullptr;
896   } else {
897     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
898       return nullptr;
899   }
900 
901   DenseSet<Register> RegDefs;
902   DenseSet<Register> RegUses;
903   CombineInfo *Where;
904   if (CI.I->mayLoad()) {
905     // Try to hoist Paired up to CI.
906     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
907     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
908       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
909         return nullptr;
910     }
911     Where = &CI;
912   } else {
913     // Try to sink CI down to Paired.
914     addDefsUsesToList(*CI.I, RegDefs, RegUses);
915     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
916       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
917         return nullptr;
918     }
919     Where = &Paired;
920   }
921 
922   // Call offsetsCanBeCombined with modify = true so that the offsets are
923   // correct for the new instruction.  This should return true, because
924   // this function should only be called on CombineInfo objects that
925   // have already been confirmed to be mergeable.
926   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
927     offsetsCanBeCombined(CI, *STM, Paired, true);
928   return Where;
929 }
930 
931 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
932   if (STM->ldsRequiresM0Init())
933     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
934   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
935 }
936 
937 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
938   if (STM->ldsRequiresM0Init())
939     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
940 
941   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
942                         : AMDGPU::DS_READ2ST64_B64_gfx9;
943 }
944 
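// Merge the two DS reads CI and Paired into a single ds_read2 or
// ds_read2st64, materializing a new base register if BaseOff is set, and
// copy the results back into the original destination registers.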
945 MachineBasicBlock::iterator
946 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
947                                      MachineBasicBlock::iterator InsertBefore) {
948   MachineBasicBlock *MBB = CI.I->getParent();
949 
950   // Be careful, since the addresses could be subregisters themselves in weird
951   // cases, like vectors of pointers.
952   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
953 
954   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
955   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
956 
957   unsigned NewOffset0 = CI.Offset;
958   unsigned NewOffset1 = Paired.Offset;
959   unsigned Opc =
960       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
961 
962   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
963   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
964 
965   if (NewOffset0 > NewOffset1) {
966     // Canonicalize the merged instruction so the smaller offset comes first.
967     std::swap(NewOffset0, NewOffset1);
968     std::swap(SubRegIdx0, SubRegIdx1);
969   }
970 
971   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
972          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
973 
974   const MCInstrDesc &Read2Desc = TII->get(Opc);
975 
976   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
977   Register DestReg = MRI->createVirtualRegister(SuperRC);
978 
979   DebugLoc DL = CI.I->getDebugLoc();
980 
981   Register BaseReg = AddrReg->getReg();
982   unsigned BaseSubReg = AddrReg->getSubReg();
983   unsigned BaseRegFlags = 0;
984   if (CI.BaseOff) {
985     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
986     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
987         .addImm(CI.BaseOff);
988 
989     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
990     BaseRegFlags = RegState::Kill;
991 
992     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
993         .addReg(ImmReg)
994         .addReg(AddrReg->getReg(), 0, BaseSubReg)
995         .addImm(0); // clamp bit
996     BaseSubReg = 0;
997   }
998 
999   MachineInstrBuilder Read2 =
1000       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1001           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1002           .addImm(NewOffset0)                        // offset0
1003           .addImm(NewOffset1)                        // offset1
1004           .addImm(0)                                 // gds
1005           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1006 
1007   (void)Read2;
1008 
1009   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1010 
1011   // Copy to the old destination registers.
1012   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1013       .add(*Dest0) // Copy to same destination including flags and sub reg.
1014       .addReg(DestReg, 0, SubRegIdx0);
1015   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1016       .add(*Dest1)
1017       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1018 
1019   CI.I->eraseFromParent();
1020   Paired.I->eraseFromParent();
1021 
1022   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1023   return Read2;
1024 }
1025 
1026 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1027   if (STM->ldsRequiresM0Init())
1028     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1029   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1030                         : AMDGPU::DS_WRITE2_B64_gfx9;
1031 }
1032 
1033 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1034   if (STM->ldsRequiresM0Init())
1035     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1036                           : AMDGPU::DS_WRITE2ST64_B64;
1037 
1038   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1039                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1040 }
1041 
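// Merge the two DS writes CI and Paired into a single ds_write2 or
// ds_write2st64, materializing a new base register if BaseOff is set.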
1042 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1043     CombineInfo &CI, CombineInfo &Paired,
1044     MachineBasicBlock::iterator InsertBefore) {
1045   MachineBasicBlock *MBB = CI.I->getParent();
1046 
  // Be sure to use .add(), and not .addReg(), with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
1049   const MachineOperand *AddrReg =
1050       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1051   const MachineOperand *Data0 =
1052       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1053   const MachineOperand *Data1 =
1054       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1055 
1056   unsigned NewOffset0 = CI.Offset;
1057   unsigned NewOffset1 = Paired.Offset;
1058   unsigned Opc =
1059       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1060 
1061   if (NewOffset0 > NewOffset1) {
1062     // Canonicalize the merged instruction so the smaller offset comes first.
1063     std::swap(NewOffset0, NewOffset1);
1064     std::swap(Data0, Data1);
1065   }
1066 
1067   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1068          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1069 
1070   const MCInstrDesc &Write2Desc = TII->get(Opc);
1071   DebugLoc DL = CI.I->getDebugLoc();
1072 
1073   Register BaseReg = AddrReg->getReg();
1074   unsigned BaseSubReg = AddrReg->getSubReg();
1075   unsigned BaseRegFlags = 0;
1076   if (CI.BaseOff) {
1077     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1078     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1079         .addImm(CI.BaseOff);
1080 
1081     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1082     BaseRegFlags = RegState::Kill;
1083 
1084     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1085         .addReg(ImmReg)
1086         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1087         .addImm(0); // clamp bit
1088     BaseSubReg = 0;
1089   }
1090 
1091   MachineInstrBuilder Write2 =
1092       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1093           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1094           .add(*Data0)                               // data0
1095           .add(*Data1)                               // data1
1096           .addImm(NewOffset0)                        // offset0
1097           .addImm(NewOffset1)                        // offset1
1098           .addImm(0)                                 // gds
1099           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1100 
1101   CI.I->eraseFromParent();
1102   Paired.I->eraseFromParent();
1103 
1104   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1105   return Write2;
1106 }
1107 
1108 MachineBasicBlock::iterator
1109 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1110                                      MachineBasicBlock::iterator InsertBefore) {
1111   MachineBasicBlock *MBB = CI.I->getParent();
1112   DebugLoc DL = CI.I->getDebugLoc();
1113   const unsigned Opcode = getNewOpcode(CI, Paired);
1114 
1115   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1116 
1117   Register DestReg = MRI->createVirtualRegister(SuperRC);
1118   unsigned MergedDMask = CI.DMask | Paired.DMask;
1119   unsigned DMaskIdx =
1120       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1121 
1122   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1123   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1124     if (I == DMaskIdx)
1125       MIB.addImm(MergedDMask);
1126     else
1127       MIB.add((*CI.I).getOperand(I));
1128   }
1129 
1130   // It shouldn't be possible to get this far if the two instructions
1131   // don't have a single memoperand, because MachineInstr::mayAlias()
1132   // will return true if this is the case.
1133   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1134 
1135   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1136   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1137 
  MachineInstr *New = MIB.addMemOperand(
      combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1139 
1140   unsigned SubRegIdx0, SubRegIdx1;
1141   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1142 
1143   // Copy to the old destination registers.
1144   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1145   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1146   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1147 
1148   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1149       .add(*Dest0) // Copy to same destination including flags and sub reg.
1150       .addReg(DestReg, 0, SubRegIdx0);
1151   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1152       .add(*Dest1)
1153       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1154 
1155   CI.I->eraseFromParent();
1156   Paired.I->eraseFromParent();
1157   return New;
1158 }
1159 
1160 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1161     CombineInfo &CI, CombineInfo &Paired,
1162     MachineBasicBlock::iterator InsertBefore) {
1163   MachineBasicBlock *MBB = CI.I->getParent();
1164   DebugLoc DL = CI.I->getDebugLoc();
1165   const unsigned Opcode = getNewOpcode(CI, Paired);
1166 
1167   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1168 
1169   Register DestReg = MRI->createVirtualRegister(SuperRC);
1170   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1171 
1172   // It shouldn't be possible to get this far if the two instructions
1173   // don't have a single memoperand, because MachineInstr::mayAlias()
1174   // will return true if this is the case.
1175   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1176 
1177   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1178   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1179 
1180   MachineInstr *New =
1181       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1182           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1183           .addImm(MergedOffset) // offset
1184           .addImm(CI.CPol)      // cpol
1185           .addMemOperand(
1186               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1187 
1188   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1189   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1190   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1191 
1192   // Copy to the old destination registers.
1193   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1194   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1195   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1196 
1197   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1198       .add(*Dest0) // Copy to same destination including flags and sub reg.
1199       .addReg(DestReg, 0, SubRegIdx0);
1200   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1201       .add(*Dest1)
1202       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1203 
1204   CI.I->eraseFromParent();
1205   Paired.I->eraseFromParent();
1206   return New;
1207 }
1208 
1209 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1210     CombineInfo &CI, CombineInfo &Paired,
1211     MachineBasicBlock::iterator InsertBefore) {
1212   MachineBasicBlock *MBB = CI.I->getParent();
1213   DebugLoc DL = CI.I->getDebugLoc();
1214 
1215   const unsigned Opcode = getNewOpcode(CI, Paired);
1216 
1217   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1218 
1219   // Copy to the new source register.
1220   Register DestReg = MRI->createVirtualRegister(SuperRC);
1221   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1222 
1223   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1224 
1225   AddressRegs Regs = getRegs(Opcode, *TII);
1226 
1227   if (Regs.VAddr)
1228     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1229 
1230   // It shouldn't be possible to get this far if the two instructions
1231   // don't have a single memoperand, because MachineInstr::mayAlias()
1232   // will return true if this is the case.
1233   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1234 
1235   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1236   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1237 
1238   MachineInstr *New =
1239     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1240         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1241         .addImm(MergedOffset) // offset
1242         .addImm(CI.CPol)      // cpol
1243         .addImm(0)            // tfe
1244         .addImm(0)            // swz
1245         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1246 
1247   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1248   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1249   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1250 
1251   // Copy to the old destination registers.
1252   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1253   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1254   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1255 
1256   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1257       .add(*Dest0) // Copy to same destination including flags and sub reg.
1258       .addReg(DestReg, 0, SubRegIdx0);
1259   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1260       .add(*Dest1)
1261       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1262 
1263   CI.I->eraseFromParent();
1264   Paired.I->eraseFromParent();
1265   return New;
1266 }
1267 
1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1269     CombineInfo &CI, CombineInfo &Paired,
1270     MachineBasicBlock::iterator InsertBefore) {
1271   MachineBasicBlock *MBB = CI.I->getParent();
1272   DebugLoc DL = CI.I->getDebugLoc();
1273 
1274   const unsigned Opcode = getNewOpcode(CI, Paired);
1275 
1276   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1277 
1278   // Copy to the new source register.
1279   Register DestReg = MRI->createVirtualRegister(SuperRC);
1280   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1281 
1282   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1283 
1284   AddressRegs Regs = getRegs(Opcode, *TII);
1285 
1286   if (Regs.VAddr)
1287     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1288 
1289   unsigned JoinedFormat =
1290       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1291 
1292   // It shouldn't be possible to get this far if the two instructions
1293   // don't have a single memoperand, because MachineInstr::mayAlias()
1294   // will return true if this is the case.
1295   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1296 
1297   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1298   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1299 
1300   MachineInstr *New =
1301       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1302           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1303           .addImm(MergedOffset) // offset
1304           .addImm(JoinedFormat) // format
1305           .addImm(CI.CPol)      // cpol
1306           .addImm(0)            // tfe
1307           .addImm(0)            // swz
1308           .addMemOperand(
1309               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1310 
1311   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1312   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1313   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1314 
1315   // Copy to the old destination registers.
1316   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1317   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1318   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1319 
1320   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1321       .add(*Dest0) // Copy to same destination including flags and sub reg.
1322       .addReg(DestReg, 0, SubRegIdx0);
1323   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1324       .add(*Dest1)
1325       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1326 
1327   CI.I->eraseFromParent();
1328   Paired.I->eraseFromParent();
1329   return New;
1330 }
1331 
1332 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1333     CombineInfo &CI, CombineInfo &Paired,
1334     MachineBasicBlock::iterator InsertBefore) {
1335   MachineBasicBlock *MBB = CI.I->getParent();
1336   DebugLoc DL = CI.I->getDebugLoc();
1337 
1338   const unsigned Opcode = getNewOpcode(CI, Paired);
1339 
1340   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1341   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1342   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1343 
1344   // Copy to the new source register.
1345   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1346   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1347 
1348   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1349   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1350 
1351   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1352       .add(*Src0)
1353       .addImm(SubRegIdx0)
1354       .add(*Src1)
1355       .addImm(SubRegIdx1);
1356 
1357   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1358                  .addReg(SrcReg, RegState::Kill);
1359 
1360   AddressRegs Regs = getRegs(Opcode, *TII);
1361 
1362   if (Regs.VAddr)
1363     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1364 
1365   unsigned JoinedFormat =
1366       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1367 
1368   // It shouldn't be possible to get this far if the two instructions
1369   // don't have a single memoperand, because MachineInstr::mayAlias()
1370   // will return true if this is the case.
1371   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1372 
1373   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1374   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1375 
1376   MachineInstr *New =
1377       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1378           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1379           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1380           .addImm(JoinedFormat)                     // format
1381           .addImm(CI.CPol)                          // cpol
1382           .addImm(0)                                // tfe
1383           .addImm(0)                                // swz
1384           .addMemOperand(
1385               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1386 
1387   CI.I->eraseFromParent();
1388   Paired.I->eraseFromParent();
1389   return New;
1390 }
1391 
1392 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
1393     CombineInfo &CI, CombineInfo &Paired,
1394     MachineBasicBlock::iterator InsertBefore) {
1395   MachineBasicBlock *MBB = CI.I->getParent();
1396   DebugLoc DL = CI.I->getDebugLoc();
1397 
1398   const unsigned Opcode = getNewOpcode(CI, Paired);
1399 
1400   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1401   Register DestReg = MRI->createVirtualRegister(SuperRC);
1402 
1403   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1404 
1405   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1406   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1407 
1408   MachineInstr *New =
1409     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1410        .addImm(std::min(CI.Offset, Paired.Offset))
1411        .addImm(CI.CPol)
1412        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1413 
1414   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1415   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1416   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1417 
1418   // Copy to the old destination registers.
1419   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1420   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1421   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1422 
1423   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1424       .add(*Dest0) // Copy to same destination including flags and sub reg.
1425       .addReg(DestReg, 0, SubRegIdx0);
1426   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1427       .add(*Dest1)
1428       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1429 
1430   CI.I->eraseFromParent();
1431   Paired.I->eraseFromParent();
1432   return New;
1433 }
1434 
1435 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1436                                             const CombineInfo &Paired) {
1437   const unsigned Width = CI.Width + Paired.Width;
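  // Width counts 32-bit components: dwords for buffer, scalar and global
  // accesses, and enabled dmask channels for image instructions.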
1438 
1439   switch (CI.InstClass) {
1440   default:
1441     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1442     // FIXME: Handle d16 correctly
1443     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1444                                   Width);
1445   case TBUFFER_LOAD:
1446   case TBUFFER_STORE:
1447     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1448                                   Width);
1449 
1450   case UNKNOWN:
1451     llvm_unreachable("Unknown instruction class");
1452   case S_BUFFER_LOAD_IMM:
1453     switch (Width) {
1454     default:
1455       return 0;
1456     case 2:
1457       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1458     case 4:
1459       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1460     case 8:
1461       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1462     }
1463   case GLOBAL_LOAD:
1464     switch (Width) {
1465     default:
1466       return 0;
1467     case 2:
1468       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1469     case 3:
1470       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1471     case 4:
1472       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1473     }
1474   case MIMG:
1475     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1476            "No overlaps");
1477     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1478   }
1479 }
1480 
1481 std::pair<unsigned, unsigned>
1482 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1483                                     const CombineInfo &Paired) {
1484   bool ReverseOrder;
1485   if (CI.InstClass == MIMG) {
1486     assert(
1487         (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1488         "No overlaps");
1489     ReverseOrder = CI.DMask > Paired.DMask;
1490   } else {
1491     ReverseOrder = CI.Offset > Paired.Offset;
1492   }
1493 
1494   unsigned Idx0;
1495   unsigned Idx1;
1496 
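  // Idxs[Start][Count - 1] is the sub-register index covering Count 32-bit
  // components starting at component Start of the merged register. E.g. with
  // CI.Width == 2 and Paired.Width == 1 in order, CI's data occupies sub0_sub1
  // and Paired's data sub2.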
1497   static const unsigned Idxs[5][4] = {
1498       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1499       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1500       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1501       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1502       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1503   };
1504 
1505   assert(CI.Width >= 1 && CI.Width <= 4);
1506   assert(Paired.Width >= 1 && Paired.Width <= 4);
1507 
1508   if (ReverseOrder) {
1509     Idx1 = Idxs[0][Paired.Width - 1];
1510     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1511   } else {
1512     Idx0 = Idxs[0][CI.Width - 1];
1513     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1514   }
1515 
1516   return std::make_pair(Idx0, Idx1);
1517 }
1518 
1519 const TargetRegisterClass *
1520 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1521                                              const CombineInfo &Paired) {
1522   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1523     switch (CI.Width + Paired.Width) {
1524     default:
1525       return nullptr;
1526     case 2:
1527       return &AMDGPU::SReg_64_XEXECRegClass;
1528     case 4:
1529       return &AMDGPU::SGPR_128RegClass;
1530     case 8:
1531       return &AMDGPU::SGPR_256RegClass;
1532     case 16:
1533       return &AMDGPU::SGPR_512RegClass;
1534     }
1535   }
1536 
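  // Otherwise the merged value lives in AGPRs if the original data register
  // class was AGPR, and in VGPRs otherwise, sized to the combined width.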
1537   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1538   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1539              ? TRI->getAGPRClassForBitWidth(BitWidth)
1540              : TRI->getVGPRClassForBitWidth(BitWidth);
1541 }
1542 
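// Merge two buffer stores with adjacent offsets into a single wider store,
// e.g.:
//  buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
//  buffer_store_dword v1, v2, s[0:3], 0 offen offset:8
// ==>
//  buffer_store_dwordx2 v[4:5], v2, s[0:3], 0 offen offset:4
// where v[4:5] (illustrative) is a REG_SEQUENCE of the two original data
// registers.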
1543 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1544     CombineInfo &CI, CombineInfo &Paired,
1545     MachineBasicBlock::iterator InsertBefore) {
1546   MachineBasicBlock *MBB = CI.I->getParent();
1547   DebugLoc DL = CI.I->getDebugLoc();
1548 
1549   const unsigned Opcode = getNewOpcode(CI, Paired);
1550 
1551   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1552   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1553   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1554 
1555   // Copy to the new source register.
1556   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1557   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1558 
1559   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1560   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1561 
1562   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1563       .add(*Src0)
1564       .addImm(SubRegIdx0)
1565       .add(*Src1)
1566       .addImm(SubRegIdx1);
1567 
1568   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1569                  .addReg(SrcReg, RegState::Kill);
1570 
1571   AddressRegs Regs = getRegs(Opcode, *TII);
1572 
1573   if (Regs.VAddr)
1574     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1575 
1577   // It shouldn't be possible to get this far if the two instructions
1578   // don't have a single memoperand, because MachineInstr::mayAlias()
1579   // will return true if this is the case.
1580   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1581 
1582   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1583   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1584 
1585   MachineInstr *New =
1586     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1587         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1588         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1589         .addImm(CI.CPol)      // cpol
1590         .addImm(0)            // tfe
1591         .addImm(0)            // swz
1592         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1593 
1594   CI.I->eraseFromParent();
1595   Paired.I->eraseFromParent();
1596   return New;
1597 }
1598 
1599 MachineOperand
1600 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1601   APInt V(32, Val, true);
1602   if (TII->isInlineConstant(V))
1603     return MachineOperand::CreateImm(Val);
1604 
1605   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1606   MachineInstr *Mov =
1607   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1608           TII->get(AMDGPU::S_MOV_B32), Reg)
1609     .addImm(Val);
1610   (void)Mov;
1611   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1612   return MachineOperand::CreateReg(Reg, false);
1613 }
1614 
1615 // Compute base address using Addr and return the final register.
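// The new base is materialized immediately before MI as a 64-bit add of
// Addr.Offset to {Addr.Base.HiReg, Addr.Base.LoReg}:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo
//   %hi:vgpr_32 = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1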
1616 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1617                                            const MemAddress &Addr) const {
1618   MachineBasicBlock *MBB = MI.getParent();
1619   MachineBasicBlock::iterator MBBI = MI.getIterator();
1620   DebugLoc DL = MI.getDebugLoc();
1621 
1622   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1623           Addr.Base.LoSubReg) &&
1624          "Expected 32-bit Base-Register-Low!!");
1625 
1626   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1627           Addr.Base.HiSubReg) &&
1628          "Expected 32-bit Base-Register-Hi!!");
1629 
1630   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1631   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1632   MachineOperand OffsetHi =
1633     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1634 
1635   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1636   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1637   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1638 
1639   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1640   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1641   MachineInstr *LoHalf =
1642     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1643       .addReg(CarryReg, RegState::Define)
1644       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1645       .add(OffsetLo)
1646       .addImm(0); // clamp bit
1647   (void)LoHalf;
1648   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1649 
1650   MachineInstr *HiHalf =
1651   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1652     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1653     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1654     .add(OffsetHi)
1655     .addReg(CarryReg, RegState::Kill)
1656     .addImm(0); // clamp bit
1657   (void)HiHalf;
1658   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1659 
1660   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1661   MachineInstr *FullBase =
1662     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1663       .addReg(DestSub0)
1664       .addImm(AMDGPU::sub0)
1665       .addReg(DestSub1)
1666       .addImm(AMDGPU::sub1);
1667   (void)FullBase;
1668   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1669 
1670   return FullDestReg;
1671 }
1672 
1673 // Update base and offset with the NewBase and NewOffset in MI.
1674 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1675                                                Register NewBase,
1676                                                int32_t NewOffset) const {
1677   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1678   Base->setReg(NewBase);
1679   Base->setIsKill(false);
1680   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1681 }
1682 
1683 Optional<int32_t>
1684 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1685   if (Op.isImm())
1686     return Op.getImm();
1687 
1688   if (!Op.isReg())
1689     return None;
1690 
1691   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1692   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1693       !Def->getOperand(1).isImm())
1694     return None;
1695 
1696   return Def->getOperand(1).getImm();
1697 }
1698 
1699 // Analyzes Base and extracts:
1700 //  - the 32-bit base registers and subregisters
1701 //  - the 64-bit constant offset
1702 // The base computation is expected to look like:
1703 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1704 //   %LO:vgpr_32, %c:sreg_64_xexec =
1705 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1706 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1707 //   %Base:vreg_64 =
1708 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1709 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1710                                                       MemAddress &Addr) const {
1711   if (!Base.isReg())
1712     return;
1713 
1714   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1715   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1716       || Def->getNumOperands() != 5)
1717     return;
1718 
1719   MachineOperand BaseLo = Def->getOperand(1);
1720   MachineOperand BaseHi = Def->getOperand(3);
1721   if (!BaseLo.isReg() || !BaseHi.isReg())
1722     return;
1723 
1724   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1725   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1726 
1727   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1728       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1729     return;
1730 
1731   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1732   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1733 
1734   auto Offset0P = extractConstOffset(*Src0);
1735   if (Offset0P)
1736     BaseLo = *Src1;
1737   else {
1738     if (!(Offset0P = extractConstOffset(*Src1)))
1739       return;
1740     BaseLo = *Src0;
1741   }
1742 
1743   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1744   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1745 
1746   if (Src0->isImm())
1747     std::swap(Src0, Src1);
1748 
1749   if (!Src1->isImm())
1750     return;
1751 
1752   uint64_t Offset1 = Src1->getImm();
1753   BaseHi = *Src0;
1754 
1755   Addr.Base.LoReg = BaseLo.getReg();
1756   Addr.Base.HiReg = BaseHi.getReg();
1757   Addr.Base.LoSubReg = BaseLo.getSubReg();
1758   Addr.Base.HiSubReg = BaseHi.getSubReg();
1759   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1760 }
1761 
1762 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1763     MachineInstr &MI,
1764     MemInfoMap &Visited,
1765     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1766 
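  // Only consider instructions that are exactly one of a load or a store;
  // atomics that both load and store (and instructions that do neither) are
  // skipped.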
1767   if (!(MI.mayLoad() ^ MI.mayStore()))
1768     return false;
1769 
1770   // TODO: Support flat and scratch.
1771   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1772     return false;
1773 
1774   if (MI.mayLoad() &&
1775       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1776     return false;
1777 
1778   if (AnchorList.count(&MI))
1779     return false;
1780 
1781   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1782 
1783   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1784     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1785     return false;
1786   }
1787 
1788   // Step 1: Find the base registers and a 64-bit constant offset.
1789   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1790   MemAddress MAddr;
1791   if (Visited.find(&MI) == Visited.end()) {
1792     processBaseWithConstOffset(Base, MAddr);
1793     Visited[&MI] = MAddr;
1794   } else
1795     MAddr = Visited[&MI];
1796 
1797   if (MAddr.Offset == 0) {
1798     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1799                          " constant offsets that can be promoted.\n";);
1800     return false;
1801   }
1802 
1803   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1804              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1805 
1806   // Step 2: Traverse MI's basic block and find an anchor (an instruction
1807   // with the same base registers) at the highest legal 13-bit distance from
1808   // MI's offset. E.g. (64-bit loads):
1809   // bb:
1810   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1811   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1812   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1813   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1814   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1815   //
1816   // Starting from the first load, the optimization tries to find a new base
1817   // from which (&a + 4096) is within a 13-bit distance. Both &a + 6144 and
1818   // &a + 8192 are within that distance of &a + 4096. The heuristic picks
1819   // &a + 8192 as the new base (anchor) because the larger distance can
1820   // presumably accommodate more intermediate bases.
1821   //
1822   // Step 3: Move (&a + 8192) above load1, then compute and promote offsets
1823   // relative to (&a + 8192) for load1, load2, and load4.
1824   //   addr = &a + 8192
1825   //   load1 = load(addr,       -4096)
1826   //   load2 = load(addr,       -2048)
1827   //   load3 = load(addr,       0)
1828   //   load4 = load(addr,       2048)
1829   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1830   //
1831   MachineInstr *AnchorInst = nullptr;
1832   MemAddress AnchorAddr;
1833   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1834   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1835 
1836   MachineBasicBlock *MBB = MI.getParent();
1837   MachineBasicBlock::iterator E = MBB->end();
1838   MachineBasicBlock::iterator MBBI = MI.getIterator();
1839   ++MBBI;
1840   const SITargetLowering *TLI =
1841     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1842 
1843   for ( ; MBBI != E; ++MBBI) {
1844     MachineInstr &MINext = *MBBI;
1845     // TODO: Support finding an anchor (with the same base) from store
1846     // addresses or any other load addresses where the opcodes are different.
1847     if (MINext.getOpcode() != MI.getOpcode() ||
1848         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1849       continue;
1850 
1851     const MachineOperand &BaseNext =
1852       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1853     MemAddress MAddrNext;
1854     if (Visited.find(&MINext) == Visited.end()) {
1855       processBaseWithConstOffset(BaseNext, MAddrNext);
1856       Visited[&MINext] = MAddrNext;
1857     } else
1858       MAddrNext = Visited[&MINext];
1859 
1860     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1861         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1862         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1863         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1864       continue;
1865 
1866     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1867 
1868     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1869     TargetLoweringBase::AddrMode AM;
1870     AM.HasBaseReg = true;
1871     AM.BaseOffs = Dist;
1872     if (TLI->isLegalGlobalAddressingMode(AM) &&
1873         (uint32_t)std::abs(Dist) > MaxDist) {
1874       MaxDist = std::abs(Dist);
1875 
1876       AnchorAddr = MAddrNext;
1877       AnchorInst = &MINext;
1878     }
1879   }
1880 
1881   if (AnchorInst) {
1882     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1883                AnchorInst->dump());
1884     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1885                <<  AnchorAddr.Offset << "\n\n");
1886 
1887     // Instead of moving up, just re-compute anchor-instruction's base address.
1888     Register Base = computeBase(MI, AnchorAddr);
1889 
1890     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1891     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1892 
1893     for (auto P : InstsWCommonBase) {
1894       TargetLoweringBase::AddrMode AM;
1895       AM.HasBaseReg = true;
1896       AM.BaseOffs = P.second - AnchorAddr.Offset;
1897 
1898       if (TLI->isLegalGlobalAddressingMode(AM)) {
1899         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1900                    dbgs() << ")"; P.first->dump());
1901         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1902         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1903       }
1904     }
1905     AnchorList.insert(AnchorInst);
1906     return true;
1907   }
1908 
1909   return false;
1910 }
1911 
1912 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1913                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1914   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1915     if (AddrList.front().InstClass == CI.InstClass &&
1916         AddrList.front().IsAGPR == CI.IsAGPR &&
1917         AddrList.front().hasSameBaseAddress(*CI.I)) {
1918       AddrList.emplace_back(CI);
1919       return;
1920     }
1921   }
1922 
1923   // Base address not found, so add a new list.
1924   MergeableInsts.emplace_back(1, CI);
1925 }
1926 
1927 std::pair<MachineBasicBlock::iterator, bool>
1928 SILoadStoreOptimizer::collectMergeableInsts(
1929     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1930     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1931     std::list<std::list<CombineInfo>> &MergeableInsts) const {
1932   bool Modified = false;
1933 
1934   // Sort potentially mergeable instructions into lists, one per base address.
1935   unsigned Order = 0;
1936   MachineBasicBlock::iterator BlockI = Begin;
1937   for (; BlockI != End; ++BlockI) {
1938     MachineInstr &MI = *BlockI;
1939 
1940     // We run this before checking if an address is mergeable, because it can
1941     // produce better code even if the instructions aren't mergeable.
1942     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1943       Modified = true;
1944 
1945     // Treat volatile accesses, ordered accesses and unmodeled side effects as
1946     // barriers. The search can resume after the barrier for separate merges.
1947     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
1948       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
1949 
1950       // Search will resume after this instruction in a separate merge list.
1951       ++BlockI;
1952       break;
1953     }
1954 
1955     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1956     if (InstClass == UNKNOWN)
1957       continue;
1958 
1959     // Do not merge VMEM buffer instructions with "swizzled" bit set.
1960     int Swizzled =
1961         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
1962     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
1963       continue;
1964 
1965     CombineInfo CI;
1966     CI.setMI(MI, *this);
1967     CI.Order = Order++;
1968 
1969     if (!CI.hasMergeableAddress(*MRI))
1970       continue;
1971 
1972     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
1973       // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
1974       //        operands. However, we report that ds_write2 takes only VGPR
1975       //        data so that machine copy propagation does not create an
1976       //        illegal instruction with VGPR and AGPR sources. Consequently,
1977       //        if we created such an instruction, the verifier would
1978       //        complain.
1979       continue;
1980     }
1981 
1982     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
1983 
1984     addInstToMergeableList(CI, MergeableInsts);
1985   }
1986 
1987   // At this point we have lists of mergeable instructions.
1988   //
1989   // Part 2: Discard any list with fewer than two entries, since a merge
1990   // needs at least two instructions with the same base address, then sort
1991   // the remaining lists by offset so that merge candidates end up adjacent
1992   // to one another.
1993 
1994   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1995                                                    E = MergeableInsts.end(); I != E;) {
1996 
1997     std::list<CombineInfo> &MergeList = *I;
1998     if (MergeList.size() <= 1) {
1999       // This means we have found only one instruction with a given address
2000       // that can be merged, and we need at least 2 instructions to do a merge,
2001       // so this list can be discarded.
2002       I = MergeableInsts.erase(I);
2003       continue;
2004     }
2005 
2006     // Sort the list by offset; this way mergeable instructions will be
2007     // adjacent to each other in the list, which will make it easier to find
2008     // matches.
2009     MergeList.sort(
2010         [] (const CombineInfo &A, const CombineInfo &B) {
2011           return A.Offset < B.Offset;
2012         });
2013     ++I;
2014   }
2015 
2016   return std::make_pair(BlockI, Modified);
2017 }
2018 
2019 // Scan through looking for adjacent memory operations with constant offsets
2020 // from the same base register. We rely on the scheduler to do the hard work
2021 // of clustering nearby loads, and assume these are all adjacent.
2022 bool SILoadStoreOptimizer::optimizeBlock(
2023                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2024   bool Modified = false;
2025 
2026   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2027                                                    E = MergeableInsts.end(); I != E;) {
2028     std::list<CombineInfo> &MergeList = *I;
2029 
2030     bool OptimizeListAgain = false;
2031     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2032       // We weren't able to make any changes, so delete the list so we don't
2033       // process the same instructions the next time we try to optimize this
2034       // block.
2035       I = MergeableInsts.erase(I);
2036       continue;
2037     }
2038 
2039     Modified = true;
2040 
2041     // We made changes, but also determined that there were no more optimization
2042     // opportunities, so we don't need to reprocess the list.
2043     if (!OptimizeListAgain) {
2044       I = MergeableInsts.erase(I);
2045       continue;
2046     }
2047     OptimizeAgain = true;
2048   }
2049   return Modified;
2050 }
2051 
2052 bool
2053 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2054                                           std::list<CombineInfo> &MergeList,
2055                                           bool &OptimizeListAgain) {
2056   if (MergeList.empty())
2057     return false;
2058 
2059   bool Modified = false;
2060 
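  // MergeList is sorted by offset, so only neighboring entries need to be
  // considered. When a pair merges, the entry that comes first in program
  // order is updated to describe the new wider instruction and the other
  // entry is erased, so the result may be merged again on a later pass.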
2061   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2062        Next = std::next(I)) {
2063 
2064     auto First = I;
2065     auto Second = Next;
2066 
2067     if ((*First).Order > (*Second).Order)
2068       std::swap(First, Second);
2069     CombineInfo &CI = *First;
2070     CombineInfo &Paired = *Second;
2071 
2072     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2073     if (!Where) {
2074       ++I;
2075       continue;
2076     }
2077 
2078     Modified = true;
2079 
2080     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2081 
2082     MachineBasicBlock::iterator NewMI;
2083     switch (CI.InstClass) {
2084     default:
2085       llvm_unreachable("unknown InstClass");
2086       break;
2087     case DS_READ:
2088       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2089       break;
2090     case DS_WRITE:
2091       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2092       break;
2093     case S_BUFFER_LOAD_IMM:
2094       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2095       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2096       break;
2097     case BUFFER_LOAD:
2098       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2099       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2100       break;
2101     case BUFFER_STORE:
2102       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2103       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2104       break;
2105     case MIMG:
2106       NewMI = mergeImagePair(CI, Paired, Where->I);
2107       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2108       break;
2109     case TBUFFER_LOAD:
2110       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2111       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2112       break;
2113     case TBUFFER_STORE:
2114       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2115       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2116       break;
2117     case GLOBAL_LOAD:
2118       NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
2119       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2120       break;
2121     }
2122     CI.setMI(NewMI, *this);
2123     CI.Order = Where->Order;
2124     if (I == Second)
2125       I = Next;
2126 
2127     MergeList.erase(Second);
2128   }
2129 
2130   return Modified;
2131 }
2132 
2133 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2134   if (skipFunction(MF.getFunction()))
2135     return false;
2136 
2137   STM = &MF.getSubtarget<GCNSubtarget>();
2138   if (!STM->loadStoreOptEnabled())
2139     return false;
2140 
2141   TII = STM->getInstrInfo();
2142   TRI = &TII->getRegisterInfo();
2143 
2144   MRI = &MF.getRegInfo();
2145   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2146 
2147   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2148 
2149   bool Modified = false;
2150 
2151   // Contains the list of instructions for which constant offsets are being
2152   // promoted to the immediate. This is tracked for an entire block at a time.
2153   SmallPtrSet<MachineInstr *, 4> AnchorList;
2154   MemInfoMap Visited;
2155 
2156   for (MachineBasicBlock &MBB : MF) {
2157     MachineBasicBlock::iterator SectionEnd;
2158     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2159          I = SectionEnd) {
2160       bool CollectModified;
2161       std::list<std::list<CombineInfo>> MergeableInsts;
2162 
2163       // First pass: Collect list of all instructions we know how to merge in a
2164       // subset of the block.
2165       std::tie(SectionEnd, CollectModified) =
2166           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2167 
2168       Modified |= CollectModified;
2169 
2170       do {
2171         OptimizeAgain = false;
2172         Modified |= optimizeBlock(MergeableInsts);
2173       } while (OptimizeAgain);
2174     }
2175 
2176     Visited.clear();
2177     AnchorList.clear();
2178   }
2179 
2180   return Modified;
2181 }
2182