1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from a nearby instruction that
24 // allows it to have a 13-bit constant offset and then promotes that offset
25 // to the immediate field.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, recomputes live intervals, and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together, we can add to the base
56 //   pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82   GLOBAL_LOAD,
83   GLOBAL_LOAD_SADDR,
84   GLOBAL_STORE,
85   GLOBAL_STORE_SADDR
86 };
87 
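// Describes which kinds of address operands (and how many vector address
// registers) a memory opcode uses; filled in by getRegs() below.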
88 struct AddressRegs {
89   unsigned char NumVAddrs = 0;
90   bool SBase = false;
91   bool SRsrc = false;
92   bool SOffset = false;
93   bool SAddr = false;
94   bool VAddr = false;
95   bool Addr = false;
96   bool SSamp = false;
97 };
98 
99 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
100 const unsigned MaxAddressRegs = 12 + 1 + 1;
101 
102 class SILoadStoreOptimizer : public MachineFunctionPass {
103   struct CombineInfo {
104     MachineBasicBlock::iterator I;
105     unsigned EltSize;
106     unsigned Offset;
107     unsigned Width;
108     unsigned Format;
109     unsigned BaseOff;
110     unsigned DMask;
111     InstClassEnum InstClass;
112     unsigned CPol = 0;
113     bool IsAGPR;
114     bool UseST64;
115     int AddrIdx[MaxAddressRegs];
116     const MachineOperand *AddrReg[MaxAddressRegs];
117     unsigned NumAddresses;
118     unsigned Order;
119 
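    // Return true if \p MI uses the same address operands as this instruction:
    // identical registers/subregisters or identical immediates.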
120     bool hasSameBaseAddress(const MachineInstr &MI) {
121       for (unsigned i = 0; i < NumAddresses; i++) {
122         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
123 
124         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
125           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
126               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
127             return false;
128           }
129           continue;
130         }
131 
132         // Check same base pointer. Be careful of subregisters, which can occur
133         // with vectors of pointers.
134         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
135             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
136           return false;
137         }
138       }
139       return true;
140     }
141 
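    // Return true if every address operand is either an immediate or a virtual
    // register with more than one use, i.e. an address that another
    // instruction in the block could share.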
142     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
143       for (unsigned i = 0; i < NumAddresses; ++i) {
144         const MachineOperand *AddrOp = AddrReg[i];
145         // Immediates are always OK.
146         if (AddrOp->isImm())
147           continue;
148 
149         // Don't try to merge addresses that aren't either immediates or registers.
150         // TODO: Should be possible to merge FrameIndexes and maybe some other
151         // non-register operands.
152         if (!AddrOp->isReg())
153           return false;
154 
155         // TODO: We should be able to merge physical reg addresses.
156         if (AddrOp->getReg().isPhysical())
157           return false;
158 
159         // If an address has only one use then there will be no other
160         // instructions with the same address, so we can't merge this one.
161         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
162           return false;
163       }
164       return true;
165     }
166 
167     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
168 
169     // Order by dmask for MIMG instructions, otherwise by offset.
170     bool operator<(const CombineInfo& Other) const {
171       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
172     }
173   };
174 
175   struct BaseRegisters {
176     Register LoReg;
177     Register HiReg;
178 
179     unsigned LoSubReg = 0;
180     unsigned HiSubReg = 0;
181   };
182 
183   struct MemAddress {
184     BaseRegisters Base;
185     int64_t Offset = 0;
186   };
187 
188   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
189 
190 private:
191   const GCNSubtarget *STM = nullptr;
192   const SIInstrInfo *TII = nullptr;
193   const SIRegisterInfo *TRI = nullptr;
194   MachineRegisterInfo *MRI = nullptr;
195   AliasAnalysis *AA = nullptr;
196   bool OptimizeAgain;
197 
198   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
199                            const DenseSet<Register> &ARegUses,
200                            const MachineInstr &A, const MachineInstr &B) const;
201   static bool dmasksCanBeCombined(const CombineInfo &CI,
202                                   const SIInstrInfo &TII,
203                                   const CombineInfo &Paired);
204   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
205                                    CombineInfo &Paired, bool Modify = false);
206   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
207                         const CombineInfo &Paired);
208   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
209   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
210                                                      const CombineInfo &Paired);
211   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
212                                                     const CombineInfo &Paired);
213   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
214 
215   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
216 
217   unsigned read2Opcode(unsigned EltSize) const;
218   unsigned read2ST64Opcode(unsigned EltSize) const;
219   MachineBasicBlock::iterator
220   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
221                  MachineBasicBlock::iterator InsertBefore);
222 
223   unsigned write2Opcode(unsigned EltSize) const;
224   unsigned write2ST64Opcode(unsigned EltSize) const;
225   MachineBasicBlock::iterator
226   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
227                   MachineBasicBlock::iterator InsertBefore);
228   MachineBasicBlock::iterator
229   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
230                  MachineBasicBlock::iterator InsertBefore);
231   MachineBasicBlock::iterator
232   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
233                           MachineBasicBlock::iterator InsertBefore);
234   MachineBasicBlock::iterator
235   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
236                       MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
239                        MachineBasicBlock::iterator InsertBefore);
240   MachineBasicBlock::iterator
241   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
242                        MachineBasicBlock::iterator InsertBefore);
243   MachineBasicBlock::iterator
244   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
245                         MachineBasicBlock::iterator InsertBefore);
246   MachineBasicBlock::iterator
247   mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
248                       MachineBasicBlock::iterator InsertBefore);
249   MachineBasicBlock::iterator
250   mergeGlobalStorePair(CombineInfo &CI, CombineInfo &Paired,
251                        MachineBasicBlock::iterator InsertBefore);
252 
253   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
254                            int32_t NewOffset) const;
255   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
256   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
257   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
258   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
259   /// Promotes a constant offset to the immediate by adjusting the base. It
260   /// tries to use a base from a nearby instruction that allows it to have
261   /// a 13-bit constant offset which gets promoted to the immediate.
262   bool promoteConstantOffsetToImm(MachineInstr &CI,
263                                   MemInfoMap &Visited,
264                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
265   void addInstToMergeableList(const CombineInfo &CI,
266                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
267 
268   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
269       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
270       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
271       std::list<std::list<CombineInfo>> &MergeableInsts) const;
272 
273   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
274                                                      const CombineInfo &Paired);
275 
276 public:
277   static char ID;
278 
279   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
280     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
281   }
282 
283   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
284                                      bool &OptimizeListAgain);
285   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
286 
287   bool runOnMachineFunction(MachineFunction &MF) override;
288 
289   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
290 
291   void getAnalysisUsage(AnalysisUsage &AU) const override {
292     AU.setPreservesCFG();
293     AU.addRequired<AAResultsWrapperPass>();
294 
295     MachineFunctionPass::getAnalysisUsage(AU);
296   }
297 
298   MachineFunctionProperties getRequiredProperties() const override {
299     return MachineFunctionProperties()
300       .set(MachineFunctionProperties::Property::IsSSA);
301   }
302 };
303 
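// Return the data width of \p MI for merging purposes: the element count for
// MUBUF/MTBUF, the dmask population count for MIMG, and the number of dwords
// for SMEM, global and DS accesses (0 for unhandled opcodes).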
304 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
305   const unsigned Opc = MI.getOpcode();
306 
307   if (TII.isMUBUF(Opc)) {
308     // FIXME: Handle d16 correctly
309     return AMDGPU::getMUBUFElements(Opc);
310   }
311   if (TII.isMIMG(MI)) {
312     uint64_t DMaskImm =
313         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
314     return countPopulation(DMaskImm);
315   }
316   if (TII.isMTBUF(Opc)) {
317     return AMDGPU::getMTBUFElements(Opc);
318   }
319 
320   switch (Opc) {
321   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
322   case AMDGPU::GLOBAL_LOAD_DWORD:
323   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
324   case AMDGPU::GLOBAL_STORE_DWORD:
325   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
326     return 1;
327   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
328   case AMDGPU::GLOBAL_LOAD_DWORDX2:
329   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
330   case AMDGPU::GLOBAL_STORE_DWORDX2:
331   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
332     return 2;
333   case AMDGPU::GLOBAL_LOAD_DWORDX3:
334   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
335   case AMDGPU::GLOBAL_STORE_DWORDX3:
336   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
337     return 3;
338   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
339   case AMDGPU::GLOBAL_LOAD_DWORDX4:
340   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
341   case AMDGPU::GLOBAL_STORE_DWORDX4:
342   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
343     return 4;
344   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
345     return 8;
346   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
347   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
348   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
349   case AMDGPU::DS_WRITE_B32_gfx9:
350     return 1;
351   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
352   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
353   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
354   case AMDGPU::DS_WRITE_B64_gfx9:
355     return 2;
356   default:
357     return 0;
358   }
359 }
360 
361 /// Maps instruction opcode to enum InstClassEnum.
362 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
363   switch (Opc) {
364   default:
365     if (TII.isMUBUF(Opc)) {
366       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
367       default:
368         return UNKNOWN;
369       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
370       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
371       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
372       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
373         return BUFFER_LOAD;
374       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
375       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
376       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
377       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
378         return BUFFER_STORE;
379       }
380     }
381     if (TII.isMIMG(Opc)) {
382       // Ignore instructions encoded without vaddr.
383       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
384           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
385         return UNKNOWN;
386       // Ignore BVH instructions
387       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
388         return UNKNOWN;
389       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
390       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
391           TII.isGather4(Opc))
392         return UNKNOWN;
393       return MIMG;
394     }
395     if (TII.isMTBUF(Opc)) {
396       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
397       default:
398         return UNKNOWN;
399       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
400       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
401       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
402       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
403         return TBUFFER_LOAD;
404       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
405       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
406       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
407       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
408         return TBUFFER_STORE;
409       }
410     }
411     return UNKNOWN;
412   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
413   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
414   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
415   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
416     return S_BUFFER_LOAD_IMM;
417   case AMDGPU::DS_READ_B32:
418   case AMDGPU::DS_READ_B32_gfx9:
419   case AMDGPU::DS_READ_B64:
420   case AMDGPU::DS_READ_B64_gfx9:
421     return DS_READ;
422   case AMDGPU::DS_WRITE_B32:
423   case AMDGPU::DS_WRITE_B32_gfx9:
424   case AMDGPU::DS_WRITE_B64:
425   case AMDGPU::DS_WRITE_B64_gfx9:
426     return DS_WRITE;
427   case AMDGPU::GLOBAL_LOAD_DWORD:
428   case AMDGPU::GLOBAL_LOAD_DWORDX2:
429   case AMDGPU::GLOBAL_LOAD_DWORDX3:
430   case AMDGPU::GLOBAL_LOAD_DWORDX4:
431     return GLOBAL_LOAD;
432   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
433   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
434   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
435   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
436     return GLOBAL_LOAD_SADDR;
437   case AMDGPU::GLOBAL_STORE_DWORD:
438   case AMDGPU::GLOBAL_STORE_DWORDX2:
439   case AMDGPU::GLOBAL_STORE_DWORDX3:
440   case AMDGPU::GLOBAL_STORE_DWORDX4:
441     return GLOBAL_STORE;
442   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
443   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
444   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
445   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
446     return GLOBAL_STORE_SADDR;
447   }
448 }
449 
450 /// Determines instruction subclass from opcode. Only instructions
451 /// of the same subclass can be merged together. The merged instruction may have
452 /// a different subclass but must have the same class.
453 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
454   switch (Opc) {
455   default:
456     if (TII.isMUBUF(Opc))
457       return AMDGPU::getMUBUFBaseOpcode(Opc);
458     if (TII.isMIMG(Opc)) {
459       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
460       assert(Info);
461       return Info->BaseOpcode;
462     }
463     if (TII.isMTBUF(Opc))
464       return AMDGPU::getMTBUFBaseOpcode(Opc);
465     return -1;
466   case AMDGPU::DS_READ_B32:
467   case AMDGPU::DS_READ_B32_gfx9:
468   case AMDGPU::DS_READ_B64:
469   case AMDGPU::DS_READ_B64_gfx9:
470   case AMDGPU::DS_WRITE_B32:
471   case AMDGPU::DS_WRITE_B32_gfx9:
472   case AMDGPU::DS_WRITE_B64:
473   case AMDGPU::DS_WRITE_B64_gfx9:
474     return Opc;
475   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
476   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
477   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
478   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
479     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
480   case AMDGPU::GLOBAL_LOAD_DWORD:
481   case AMDGPU::GLOBAL_LOAD_DWORDX2:
482   case AMDGPU::GLOBAL_LOAD_DWORDX3:
483   case AMDGPU::GLOBAL_LOAD_DWORDX4:
484     return AMDGPU::GLOBAL_LOAD_DWORD;
485   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
486   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
487   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
488   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
489     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
490   case AMDGPU::GLOBAL_STORE_DWORD:
491   case AMDGPU::GLOBAL_STORE_DWORDX2:
492   case AMDGPU::GLOBAL_STORE_DWORDX3:
493   case AMDGPU::GLOBAL_STORE_DWORDX4:
494     return AMDGPU::GLOBAL_STORE_DWORD;
495   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
496   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
497   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
498   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
499     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
500   }
501 }
502 
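// Determine which address operands (vaddr, saddr, sbase, srsrc, soffset, ...)
// the given opcode uses.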
503 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
504   AddressRegs Result;
505 
506   if (TII.isMUBUF(Opc)) {
507     if (AMDGPU::getMUBUFHasVAddr(Opc))
508       Result.VAddr = true;
509     if (AMDGPU::getMUBUFHasSrsrc(Opc))
510       Result.SRsrc = true;
511     if (AMDGPU::getMUBUFHasSoffset(Opc))
512       Result.SOffset = true;
513 
514     return Result;
515   }
516 
517   if (TII.isMIMG(Opc)) {
518     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
519     if (VAddr0Idx >= 0) {
520       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
521       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
522     } else {
523       Result.VAddr = true;
524     }
525     Result.SRsrc = true;
526     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
527     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
528       Result.SSamp = true;
529 
530     return Result;
531   }
532   if (TII.isMTBUF(Opc)) {
533     if (AMDGPU::getMTBUFHasVAddr(Opc))
534       Result.VAddr = true;
535     if (AMDGPU::getMTBUFHasSrsrc(Opc))
536       Result.SRsrc = true;
537     if (AMDGPU::getMTBUFHasSoffset(Opc))
538       Result.SOffset = true;
539 
540     return Result;
541   }
542 
543   switch (Opc) {
544   default:
545     return Result;
546   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
547   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
548   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
549   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
550     Result.SBase = true;
551     return Result;
552   case AMDGPU::DS_READ_B32:
553   case AMDGPU::DS_READ_B64:
554   case AMDGPU::DS_READ_B32_gfx9:
555   case AMDGPU::DS_READ_B64_gfx9:
556   case AMDGPU::DS_WRITE_B32:
557   case AMDGPU::DS_WRITE_B64:
558   case AMDGPU::DS_WRITE_B32_gfx9:
559   case AMDGPU::DS_WRITE_B64_gfx9:
560     Result.Addr = true;
561     return Result;
562   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
563   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
564   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
565   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
566   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
567   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
568   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
569   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
570     Result.SAddr = true;
571     LLVM_FALLTHROUGH;
572   case AMDGPU::GLOBAL_LOAD_DWORD:
573   case AMDGPU::GLOBAL_LOAD_DWORDX2:
574   case AMDGPU::GLOBAL_LOAD_DWORDX3:
575   case AMDGPU::GLOBAL_LOAD_DWORDX4:
576   case AMDGPU::GLOBAL_STORE_DWORD:
577   case AMDGPU::GLOBAL_STORE_DWORDX2:
578   case AMDGPU::GLOBAL_STORE_DWORDX3:
579   case AMDGPU::GLOBAL_STORE_DWORDX4:
580     Result.VAddr = true;
581     return Result;
582   }
583 }
584 
585 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
586                                               const SILoadStoreOptimizer &LSO) {
587   I = MI;
588   unsigned Opc = MI->getOpcode();
589   InstClass = getInstClass(Opc, *LSO.TII);
590 
591   if (InstClass == UNKNOWN)
592     return;
593 
594   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
595 
596   switch (InstClass) {
597   case DS_READ:
598     EltSize =
599           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
600                                                                           : 4;
601     break;
602   case DS_WRITE:
603     EltSize =
604           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
605                                                                             : 4;
606     break;
607   case S_BUFFER_LOAD_IMM:
608     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
609     break;
610   default:
611     EltSize = 4;
612     break;
613   }
614 
615   if (InstClass == MIMG) {
616     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
617     // Offset is not considered for MIMG instructions.
618     Offset = 0;
619   } else {
620     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
621     Offset = I->getOperand(OffsetIdx).getImm();
622   }
623 
624   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
625     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
626 
627   Width = getOpcodeWidth(*I, *LSO.TII);
628 
629   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
630     Offset &= 0xffff;
631   } else if (InstClass != MIMG) {
632     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
633   }
634 
635   AddressRegs Regs = getRegs(Opc, *LSO.TII);
636 
637   NumAddresses = 0;
638   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
639     AddrIdx[NumAddresses++] =
640         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
641   if (Regs.Addr)
642     AddrIdx[NumAddresses++] =
643         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
644   if (Regs.SBase)
645     AddrIdx[NumAddresses++] =
646         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
647   if (Regs.SRsrc)
648     AddrIdx[NumAddresses++] =
649         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
650   if (Regs.SOffset)
651     AddrIdx[NumAddresses++] =
652         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
653   if (Regs.SAddr)
654     AddrIdx[NumAddresses++] =
655         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
656   if (Regs.VAddr)
657     AddrIdx[NumAddresses++] =
658         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
659   if (Regs.SSamp)
660     AddrIdx[NumAddresses++] =
661         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
662   assert(NumAddresses <= MaxAddressRegs);
663 
664   for (unsigned J = 0; J < NumAddresses; J++)
665     AddrReg[J] = &I->getOperand(AddrIdx[J]);
666 }
667 
668 } // end anonymous namespace.
669 
670 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
671                       "SI Load Store Optimizer", false, false)
672 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
673 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
674                     false, false)
675 
676 char SILoadStoreOptimizer::ID = 0;
677 
678 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
679 
680 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
681   return new SILoadStoreOptimizer();
682 }
683 
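// Collect the registers defined and the registers read by \p MI.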
684 static void addDefsUsesToList(const MachineInstr &MI,
685                               DenseSet<Register> &RegDefs,
686                               DenseSet<Register> &RegUses) {
687   for (const auto &Op : MI.operands()) {
688     if (!Op.isReg())
689       continue;
690     if (Op.isDef())
691       RegDefs.insert(Op.getReg());
692     if (Op.readsReg())
693       RegUses.insert(Op.getReg());
694   }
695 }
696 
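// Return true if instruction \p A can safely be moved past instruction \p B:
// there must be no aliasing memory dependence between them, \p B must not
// touch any register that \p A defines (\p ARegDefs), and \p B must not define
// any register that \p A reads (\p ARegUses).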
697 bool SILoadStoreOptimizer::canSwapInstructions(
698     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
699     const MachineInstr &A, const MachineInstr &B) const {
700   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
701       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
702     return false;
703   for (const auto &BOp : B.operands()) {
704     if (!BOp.isReg())
705       continue;
706     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
707       return false;
708     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
709       return false;
710   }
711   return true;
712 }
713 
714 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
715 // MMO for the combined operation with a new access size.
716 MachineMemOperand *
717 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
718                                                const CombineInfo &Paired) {
719   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
720   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
721 
722   unsigned Size = MMOa->getSize() + MMOb->getSize();
723 
724   // The base pointer for the combined operation is the same as the leading
725   // operation's pointer.
726   if (Paired < CI)
727     MMOa = MMOb;
728 
729   MachineFunction *MF = CI.I->getMF();
730   return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size);
731 }
732 
733 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
734                                                const SIInstrInfo &TII,
735                                                const CombineInfo &Paired) {
736   assert(CI.InstClass == MIMG);
737 
738   // Ignore instructions with tfe/lwe set.
739   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
740   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
741 
742   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
743     return false;
744 
745   // Check other optional immediate operands for equality.
746   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
747                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
748                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
749 
750   for (auto op : OperandsToMatch) {
751     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
752     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
753       return false;
754     if (Idx != -1 &&
755         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
756       return false;
757   }
758 
759   // Check DMask for overlaps.
760   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
761   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
762 
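  // Merging requires every set bit of the smaller mask to lie strictly below
  // the lowest set bit of the larger mask, so the two sets of channels do not
  // overlap and the lower-mask channels all precede the higher-mask ones.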
763   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
764   if ((1u << AllowedBitsForMin) <= MinMask)
765     return false;
766 
767   return true;
768 }
769 
770 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
771                                        unsigned ComponentCount,
772                                        const GCNSubtarget &STI) {
773   if (ComponentCount > 4)
774     return 0;
775 
776   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
777       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
778   if (!OldFormatInfo)
779     return 0;
780 
781   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
782       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
783                                            ComponentCount,
784                                            OldFormatInfo->NumFormat, STI);
785 
786   if (!NewFormatInfo)
787     return 0;
788 
789   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
790          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
791 
792   return NewFormatInfo->Format;
793 }
794 
795 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
796 // highest power of two. Note that the result is well defined for all inputs
797 // including corner cases like:
798 // - if Lo == Hi, return that value
799 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
800 // - if Lo > Hi, return 0 (as if the range wrapped around)
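// For example, mostAlignedValueInRange(5, 20) == 16, since 16 is divisible by
// a higher power of two than any other value in [5, 20].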
801 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
802   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
803 }
804 
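// Check whether the offsets of \p CI and \p Paired can be represented by a
// single merged instruction. For DS instructions this may also (when \p Modify
// is true) rewrite the offsets to use the ST64 forms or a shifted base
// address (CI.BaseOff).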
805 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
806                                                 const GCNSubtarget &STI,
807                                                 CombineInfo &Paired,
808                                                 bool Modify) {
809   assert(CI.InstClass != MIMG);
810 
811   // XXX - Would the same offset be OK? Is there any reason this would happen or
812   // be useful?
813   if (CI.Offset == Paired.Offset)
814     return false;
815 
816   // This won't be valid if the offset isn't aligned.
817   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
818     return false;
819 
820   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
821 
822     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
823         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
824     if (!Info0)
825       return false;
826     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
827         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
828     if (!Info1)
829       return false;
830 
831     if (Info0->BitsPerComp != Info1->BitsPerComp ||
832         Info0->NumFormat != Info1->NumFormat)
833       return false;
834 
835     // TODO: Should be possible to support more formats, but if format loads
836     // are not dword-aligned, the merged load might not be valid.
837     if (Info0->BitsPerComp != 32)
838       return false;
839 
840     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
841       return false;
842   }
843 
844   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
845   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
846   CI.UseST64 = false;
847   CI.BaseOff = 0;
848 
849   // Handle all non-DS instructions.
850   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
851     return (EltOffset0 + CI.Width == EltOffset1 ||
852             EltOffset1 + Paired.Width == EltOffset0) &&
853            CI.CPol == Paired.CPol;
854   }
855 
856   // If the offset in elements doesn't fit in 8 bits, we might be able to use
857   // the stride 64 versions.
858   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
859       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
860     if (Modify) {
861       CI.Offset = EltOffset0 / 64;
862       Paired.Offset = EltOffset1 / 64;
863       CI.UseST64 = true;
864     }
865     return true;
866   }
867 
868   // Check if the new offsets fit in the reduced 8-bit range.
869   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
870     if (Modify) {
871       CI.Offset = EltOffset0;
872       Paired.Offset = EltOffset1;
873     }
874     return true;
875   }
876 
877   // Try to shift base address to decrease offsets.
878   uint32_t Min = std::min(EltOffset0, EltOffset1);
879   uint32_t Max = std::max(EltOffset0, EltOffset1);
880 
881   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
882   if (((Max - Min) & ~Mask) == 0) {
883     if (Modify) {
884       // From the range of values we could use for BaseOff, choose the one that
885       // is aligned to the highest power of two, to maximise the chance that
886       // the same offset can be reused for other load/store pairs.
887       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
888       // Copy the low bits of the offsets, so that when we adjust them by
889       // subtracting BaseOff they will be multiples of 64.
890       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
891       CI.BaseOff = BaseOff * CI.EltSize;
892       CI.Offset = (EltOffset0 - BaseOff) / 64;
893       Paired.Offset = (EltOffset1 - BaseOff) / 64;
894       CI.UseST64 = true;
895     }
896     return true;
897   }
898 
899   if (isUInt<8>(Max - Min)) {
900     if (Modify) {
901       // From the range of values we could use for BaseOff, choose the one that
902       // is aligned to the highest power of two, to maximise the chance that
903       // the same offset can be reused for other load/store pairs.
904       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
905       CI.BaseOff = BaseOff * CI.EltSize;
906       CI.Offset = EltOffset0 - BaseOff;
907       Paired.Offset = EltOffset1 - BaseOff;
908     }
909     return true;
910   }
911 
912   return false;
913 }
914 
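// Check that the combined width can be encoded: at most 4 components for most
// classes (3 only if the subtarget has dwordx3 loads/stores), and exactly 2,
// 4 or 8 dwords for S_BUFFER_LOAD_IMM.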
915 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
916                                      const CombineInfo &CI,
917                                      const CombineInfo &Paired) {
918   const unsigned Width = (CI.Width + Paired.Width);
919   switch (CI.InstClass) {
920   default:
921     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
922   case S_BUFFER_LOAD_IMM:
923     switch (Width) {
924     default:
925       return false;
926     case 2:
927     case 4:
928     case 8:
929       return true;
930     }
931   }
932 }
933 
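// Return the register class of the data operand (vdst/vdata/data0/sdst/sdata)
// of \p MI; used, among other things, to tell whether the data lives in AGPRs.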
934 const TargetRegisterClass *
935 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
936   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
937     return TRI->getRegClassForReg(*MRI, Dst->getReg());
938   }
939   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
940     return TRI->getRegClassForReg(*MRI, Src->getReg());
941   }
942   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
943     return TRI->getRegClassForReg(*MRI, Src->getReg());
944   }
945   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
946     return TRI->getRegClassForReg(*MRI, Dst->getReg());
947   }
948   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
949     return TRI->getRegClassForReg(*MRI, Src->getReg());
950   }
951   return nullptr;
952 }
953 
954 /// This function assumes that CI comes before Paired in a basic block. Return
955 /// an insertion point for the merged instruction or nullptr on failure.
956 SILoadStoreOptimizer::CombineInfo *
957 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
958                                            CombineInfo &Paired) {
959   // If another instruction has already been merged into CI, it may now be a
960   // type that we can't do any further merging into.
961   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
962     return nullptr;
963   assert(CI.InstClass == Paired.InstClass);
964 
965   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
966       getInstSubclass(Paired.I->getOpcode(), *TII))
967     return nullptr;
968 
969   // Check both offsets (or masks for MIMG) can be combined and fit in the
970   // reduced range.
971   if (CI.InstClass == MIMG) {
972     if (!dmasksCanBeCombined(CI, *TII, Paired))
973       return nullptr;
974   } else {
975     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
976       return nullptr;
977   }
978 
979   DenseSet<Register> RegDefs;
980   DenseSet<Register> RegUses;
981   CombineInfo *Where;
982   if (CI.I->mayLoad()) {
983     // Try to hoist Paired up to CI.
984     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
985     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
986       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
987         return nullptr;
988     }
989     Where = &CI;
990   } else {
991     // Try to sink CI down to Paired.
992     addDefsUsesToList(*CI.I, RegDefs, RegUses);
993     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
994       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
995         return nullptr;
996     }
997     Where = &Paired;
998   }
999 
1000   // Call offsetsCanBeCombined with modify = true so that the offsets are
1001   // correct for the new instruction.  This should return true, because
1002   // this function should only be called on CombineInfo objects that
1003   // have already been confirmed to be mergeable.
1004   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1005     offsetsCanBeCombined(CI, *STM, Paired, true);
1006   return Where;
1007 }
1008 
1009 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1010   if (STM->ldsRequiresM0Init())
1011     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1012   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1013 }
1014 
1015 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1016   if (STM->ldsRequiresM0Init())
1017     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1018 
1019   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1020                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1021 }
1022 
1023 MachineBasicBlock::iterator
1024 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1025                                      MachineBasicBlock::iterator InsertBefore) {
1026   MachineBasicBlock *MBB = CI.I->getParent();
1027 
1028   // Be careful, since the addresses could be subregisters themselves in weird
1029   // cases, like vectors of pointers.
1030   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1031 
1032   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1033   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1034 
1035   unsigned NewOffset0 = CI.Offset;
1036   unsigned NewOffset1 = Paired.Offset;
1037   unsigned Opc =
1038       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1039 
1040   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1041   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1042 
1043   if (NewOffset0 > NewOffset1) {
1044     // Canonicalize the merged instruction so the smaller offset comes first.
1045     std::swap(NewOffset0, NewOffset1);
1046     std::swap(SubRegIdx0, SubRegIdx1);
1047   }
1048 
1049   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1050          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1051 
1052   const MCInstrDesc &Read2Desc = TII->get(Opc);
1053 
1054   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1055   Register DestReg = MRI->createVirtualRegister(SuperRC);
1056 
1057   DebugLoc DL = CI.I->getDebugLoc();
1058 
1059   Register BaseReg = AddrReg->getReg();
1060   unsigned BaseSubReg = AddrReg->getSubReg();
1061   unsigned BaseRegFlags = 0;
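  // If offsetsCanBeCombined() chose a new base (CI.BaseOff != 0), materialize
  // it in an SGPR and add it to the original address so the reduced offsets
  // are applied relative to the new base.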
1062   if (CI.BaseOff) {
1063     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1064     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1065         .addImm(CI.BaseOff);
1066 
1067     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1068     BaseRegFlags = RegState::Kill;
1069 
1070     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1071         .addReg(ImmReg)
1072         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1073         .addImm(0); // clamp bit
1074     BaseSubReg = 0;
1075   }
1076 
1077   MachineInstrBuilder Read2 =
1078       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1079           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1080           .addImm(NewOffset0)                        // offset0
1081           .addImm(NewOffset1)                        // offset1
1082           .addImm(0)                                 // gds
1083           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1084 
1085   (void)Read2;
1086 
1087   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1088 
1089   // Copy to the old destination registers.
1090   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1091       .add(*Dest0) // Copy to same destination including flags and sub reg.
1092       .addReg(DestReg, 0, SubRegIdx0);
1093   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1094       .add(*Dest1)
1095       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1096 
1097   CI.I->eraseFromParent();
1098   Paired.I->eraseFromParent();
1099 
1100   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1101   return Read2;
1102 }
1103 
1104 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1105   if (STM->ldsRequiresM0Init())
1106     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1107   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1108                         : AMDGPU::DS_WRITE2_B64_gfx9;
1109 }
1110 
1111 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1112   if (STM->ldsRequiresM0Init())
1113     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1114                           : AMDGPU::DS_WRITE2ST64_B64;
1115 
1116   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1117                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1118 }
1119 
1120 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1121     CombineInfo &CI, CombineInfo &Paired,
1122     MachineBasicBlock::iterator InsertBefore) {
1123   MachineBasicBlock *MBB = CI.I->getParent();
1124 
1125   // Be sure to use .add(), and not .addReg() with these. We want to be sure
1126   // we preserve the subregister index and any register flags set on them.
1127   const MachineOperand *AddrReg =
1128       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1129   const MachineOperand *Data0 =
1130       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1131   const MachineOperand *Data1 =
1132       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1133 
1134   unsigned NewOffset0 = CI.Offset;
1135   unsigned NewOffset1 = Paired.Offset;
1136   unsigned Opc =
1137       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1138 
1139   if (NewOffset0 > NewOffset1) {
1140     // Canonicalize the merged instruction so the smaller offset comes first.
1141     std::swap(NewOffset0, NewOffset1);
1142     std::swap(Data0, Data1);
1143   }
1144 
1145   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1146          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1147 
1148   const MCInstrDesc &Write2Desc = TII->get(Opc);
1149   DebugLoc DL = CI.I->getDebugLoc();
1150 
1151   Register BaseReg = AddrReg->getReg();
1152   unsigned BaseSubReg = AddrReg->getSubReg();
1153   unsigned BaseRegFlags = 0;
1154   if (CI.BaseOff) {
1155     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1156     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1157         .addImm(CI.BaseOff);
1158 
1159     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1160     BaseRegFlags = RegState::Kill;
1161 
1162     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1163         .addReg(ImmReg)
1164         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1165         .addImm(0); // clamp bit
1166     BaseSubReg = 0;
1167   }
1168 
1169   MachineInstrBuilder Write2 =
1170       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1171           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1172           .add(*Data0)                               // data0
1173           .add(*Data1)                               // data1
1174           .addImm(NewOffset0)                        // offset0
1175           .addImm(NewOffset1)                        // offset1
1176           .addImm(0)                                 // gds
1177           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1178 
1179   CI.I->eraseFromParent();
1180   Paired.I->eraseFromParent();
1181 
1182   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1183   return Write2;
1184 }
1185 
1186 MachineBasicBlock::iterator
1187 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1188                                      MachineBasicBlock::iterator InsertBefore) {
1189   MachineBasicBlock *MBB = CI.I->getParent();
1190   DebugLoc DL = CI.I->getDebugLoc();
1191   const unsigned Opcode = getNewOpcode(CI, Paired);
1192 
1193   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1194 
1195   Register DestReg = MRI->createVirtualRegister(SuperRC);
1196   unsigned MergedDMask = CI.DMask | Paired.DMask;
1197   unsigned DMaskIdx =
1198       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1199 
1200   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1201   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1202     if (I == DMaskIdx)
1203       MIB.addImm(MergedDMask);
1204     else
1205       MIB.add((*CI.I).getOperand(I));
1206   }
1207 
1208   // It shouldn't be possible to get this far if the two instructions
1209   // don't have a single memoperand, because MachineInstr::mayAlias()
1210   // will return true if this is the case.
1211   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1212 
1213   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1214 
1215   unsigned SubRegIdx0, SubRegIdx1;
1216   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1217 
1218   // Copy to the old destination registers.
1219   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1220   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1221   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1222 
1223   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1224       .add(*Dest0) // Copy to same destination including flags and sub reg.
1225       .addReg(DestReg, 0, SubRegIdx0);
1226   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1227       .add(*Dest1)
1228       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1229 
1230   CI.I->eraseFromParent();
1231   Paired.I->eraseFromParent();
1232   return New;
1233 }
1234 
1235 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1236     CombineInfo &CI, CombineInfo &Paired,
1237     MachineBasicBlock::iterator InsertBefore) {
1238   MachineBasicBlock *MBB = CI.I->getParent();
1239   DebugLoc DL = CI.I->getDebugLoc();
1240   const unsigned Opcode = getNewOpcode(CI, Paired);
1241 
1242   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1243 
1244   Register DestReg = MRI->createVirtualRegister(SuperRC);
1245   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1246 
1247   // It shouldn't be possible to get this far if the two instructions
1248   // don't have a single memoperand, because MachineInstr::mayAlias()
1249   // will return true if this is the case.
1250   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1251 
1252   MachineInstr *New =
1253       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1254           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1255           .addImm(MergedOffset) // offset
1256           .addImm(CI.CPol)      // cpol
1257           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1258 
1259   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1260   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1261   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1262 
1263   // Copy to the old destination registers.
1264   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1265   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1266   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1267 
1268   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1269       .add(*Dest0) // Copy to same destination including flags and sub reg.
1270       .addReg(DestReg, 0, SubRegIdx0);
1271   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1272       .add(*Dest1)
1273       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1274 
1275   CI.I->eraseFromParent();
1276   Paired.I->eraseFromParent();
1277   return New;
1278 }
1279 
1280 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1281     CombineInfo &CI, CombineInfo &Paired,
1282     MachineBasicBlock::iterator InsertBefore) {
1283   MachineBasicBlock *MBB = CI.I->getParent();
1284   DebugLoc DL = CI.I->getDebugLoc();
1285 
1286   const unsigned Opcode = getNewOpcode(CI, Paired);
1287 
1288   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1289 
1290   // Create the new, wider destination register for the merged load.
1291   Register DestReg = MRI->createVirtualRegister(SuperRC);
1292   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1293 
1294   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1295 
1296   AddressRegs Regs = getRegs(Opcode, *TII);
1297 
1298   if (Regs.VAddr)
1299     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1300 
1301   // It shouldn't be possible to get this far if the two instructions
1302   // don't have a single memoperand, because MachineInstr::mayAlias()
1303   // will return true if this is the case.
1304   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1305 
1306   MachineInstr *New =
1307     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1308         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1309         .addImm(MergedOffset) // offset
1310         .addImm(CI.CPol)      // cpol
1311         .addImm(0)            // tfe
1312         .addImm(0)            // swz
1313         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1314 
1315   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1316   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1317   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1318 
1319   // Copy to the old destination registers.
1320   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1321   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1322   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1323 
1324   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1325       .add(*Dest0) // Copy to same destination including flags and sub reg.
1326       .addReg(DestReg, 0, SubRegIdx0);
1327   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1328       .add(*Dest1)
1329       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1330 
1331   CI.I->eraseFromParent();
1332   Paired.I->eraseFromParent();
1333   return New;
1334 }
1335 
1336 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1337     CombineInfo &CI, CombineInfo &Paired,
1338     MachineBasicBlock::iterator InsertBefore) {
1339   MachineBasicBlock *MBB = CI.I->getParent();
1340   DebugLoc DL = CI.I->getDebugLoc();
1341 
1342   const unsigned Opcode = getNewOpcode(CI, Paired);
1343 
1344   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1345 
1346   // Create the new, wider destination register for the merged load.
1347   Register DestReg = MRI->createVirtualRegister(SuperRC);
1348   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1349 
1350   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1351 
1352   AddressRegs Regs = getRegs(Opcode, *TII);
1353 
1354   if (Regs.VAddr)
1355     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1356 
1357   unsigned JoinedFormat =
1358       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1359 
1360   // It shouldn't be possible to get this far if the two instructions
1361   // don't have a single memoperand, because MachineInstr::mayAlias()
1362   // will return true if this is the case.
1363   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1364 
1365   MachineInstr *New =
1366       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1367           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1368           .addImm(MergedOffset) // offset
1369           .addImm(JoinedFormat) // format
1370           .addImm(CI.CPol)      // cpol
1371           .addImm(0)            // tfe
1372           .addImm(0)            // swz
1373           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1374 
1375   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1376   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1377   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1378 
1379   // Copy to the old destination registers.
1380   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1381   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1382   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1383 
1384   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1385       .add(*Dest0) // Copy to same destination including flags and sub reg.
1386       .addReg(DestReg, 0, SubRegIdx0);
1387   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1388       .add(*Dest1)
1389       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1390 
1391   CI.I->eraseFromParent();
1392   Paired.I->eraseFromParent();
1393   return New;
1394 }
1395 
1396 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1397     CombineInfo &CI, CombineInfo &Paired,
1398     MachineBasicBlock::iterator InsertBefore) {
1399   MachineBasicBlock *MBB = CI.I->getParent();
1400   DebugLoc DL = CI.I->getDebugLoc();
1401 
1402   const unsigned Opcode = getNewOpcode(CI, Paired);
1403 
1404   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1405   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1406   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1407 
1408   // Copy to the new source register.
1409   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1410   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1411 
1412   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1413   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1414 
1415   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1416       .add(*Src0)
1417       .addImm(SubRegIdx0)
1418       .add(*Src1)
1419       .addImm(SubRegIdx1);
1420 
1421   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1422                  .addReg(SrcReg, RegState::Kill);
1423 
1424   AddressRegs Regs = getRegs(Opcode, *TII);
1425 
1426   if (Regs.VAddr)
1427     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1428 
1429   unsigned JoinedFormat =
1430       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1431 
1432   // It shouldn't be possible to get this far if the two instructions
1433   // don't have a single memoperand, because MachineInstr::mayAlias()
1434   // will return true if this is the case.
1435   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1436 
1437   MachineInstr *New =
1438       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1439           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1440           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1441           .addImm(JoinedFormat)                     // format
1442           .addImm(CI.CPol)                          // cpol
1443           .addImm(0)                                // tfe
1444           .addImm(0)                                // swz
1445           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1446 
1447   CI.I->eraseFromParent();
1448   Paired.I->eraseFromParent();
1449   return New;
1450 }
1451 
1452 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
1453     CombineInfo &CI, CombineInfo &Paired,
1454     MachineBasicBlock::iterator InsertBefore) {
1455   MachineBasicBlock *MBB = CI.I->getParent();
1456   DebugLoc DL = CI.I->getDebugLoc();
1457 
1458   const unsigned Opcode = getNewOpcode(CI, Paired);
1459 
1460   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1461   Register DestReg = MRI->createVirtualRegister(SuperRC);
1462 
1463   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1464 
1465   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1466     MIB.add(*SAddr);
1467 
1468   MachineInstr *New =
1469     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1470        .addImm(std::min(CI.Offset, Paired.Offset))
1471        .addImm(CI.CPol)
1472        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1473 
1474   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1475   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1476   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1477 
1478   // Copy to the old destination registers.
1479   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1480   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1481   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1482 
1483   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1484       .add(*Dest0) // Copy to same destination including flags and sub reg.
1485       .addReg(DestReg, 0, SubRegIdx0);
1486   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1487       .add(*Dest1)
1488       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1489 
1490   CI.I->eraseFromParent();
1491   Paired.I->eraseFromParent();
1492   return New;
1493 }
1494 
1495 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalStorePair(
1496     CombineInfo &CI, CombineInfo &Paired,
1497     MachineBasicBlock::iterator InsertBefore) {
1498   MachineBasicBlock *MBB = CI.I->getParent();
1499   DebugLoc DL = CI.I->getDebugLoc();
1500 
1501   const unsigned Opcode = getNewOpcode(CI, Paired);
1502 
1503   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1504   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1505   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1506 
1507   // Copy to the new source register.
1508   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1509   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1510 
1511   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1512   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1513 
1514   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1515       .add(*Src0)
1516       .addImm(SubRegIdx0)
1517       .add(*Src1)
1518       .addImm(SubRegIdx1);
1519 
1520   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1521                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1522                  .addReg(SrcReg, RegState::Kill);
1523 
1524   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1525     MIB.add(*SAddr);
1526 
1527   MachineInstr *New =
1528     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1529        .addImm(CI.CPol)
1530        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1531 
1532   CI.I->eraseFromParent();
1533   Paired.I->eraseFromParent();
1534   return New;
1535 }
1536 
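// Pick the opcode for the merged access based on the combined width. A purely
// illustrative example: two width-1 S_BUFFER_LOAD_DWORD_IMM accesses
// (combined width 2) map to S_BUFFER_LOAD_DWORDX2_IMM below.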
1537 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1538                                             const CombineInfo &Paired) {
1539   const unsigned Width = CI.Width + Paired.Width;
1540 
1541   switch (CI.InstClass) {
1542   default:
1543     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1544     // FIXME: Handle d16 correctly
1545     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1546                                   Width);
1547   case TBUFFER_LOAD:
1548   case TBUFFER_STORE:
1549     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1550                                   Width);
1551 
1552   case UNKNOWN:
1553     llvm_unreachable("Unknown instruction class");
1554   case S_BUFFER_LOAD_IMM:
1555     switch (Width) {
1556     default:
1557       return 0;
1558     case 2:
1559       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1560     case 4:
1561       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1562     case 8:
1563       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1564     }
1565   case GLOBAL_LOAD:
1566     switch (Width) {
1567     default:
1568       return 0;
1569     case 2:
1570       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1571     case 3:
1572       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1573     case 4:
1574       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1575     }
1576   case GLOBAL_LOAD_SADDR:
1577     switch (Width) {
1578     default:
1579       return 0;
1580     case 2:
1581       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1582     case 3:
1583       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1584     case 4:
1585       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1586     }
1587   case GLOBAL_STORE:
1588     switch (Width) {
1589     default:
1590       return 0;
1591     case 2:
1592       return AMDGPU::GLOBAL_STORE_DWORDX2;
1593     case 3:
1594       return AMDGPU::GLOBAL_STORE_DWORDX3;
1595     case 4:
1596       return AMDGPU::GLOBAL_STORE_DWORDX4;
1597     }
1598   case GLOBAL_STORE_SADDR:
1599     switch (Width) {
1600     default:
1601       return 0;
1602     case 2:
1603       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1604     case 3:
1605       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1606     case 4:
1607       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1608     }
1609   case MIMG:
1610     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1611            "No overlaps");
1612     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1613   }
1614 }
1615 
1616 std::pair<unsigned, unsigned>
1617 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1618                                     const CombineInfo &Paired) {
1619   assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1620                                    CI.Width + Paired.Width)) &&
1621          "No overlaps");
1622 
1623   unsigned Idx0;
1624   unsigned Idx1;
1625 
  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };
1633 
1634   assert(CI.Width >= 1 && CI.Width <= 4);
1635   assert(Paired.Width >= 1 && Paired.Width <= 4);
1636 
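  // A purely illustrative example: merging a one-dword access with a
  // following two-dword access yields {sub0, sub1_sub2}; if Paired precedes
  // CI in offset order, the roles (and hence the indices) are swapped.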
1637   if (Paired < CI) {
1638     Idx1 = Idxs[0][Paired.Width - 1];
1639     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1640   } else {
1641     Idx0 = Idxs[0][CI.Width - 1];
1642     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1643   }
1644 
1645   return std::make_pair(Idx0, Idx1);
1646 }
1647 
1648 const TargetRegisterClass *
1649 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1650                                              const CombineInfo &Paired) {
1651   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1652     switch (CI.Width + Paired.Width) {
1653     default:
1654       return nullptr;
1655     case 2:
1656       return &AMDGPU::SReg_64_XEXECRegClass;
1657     case 4:
1658       return &AMDGPU::SGPR_128RegClass;
1659     case 8:
1660       return &AMDGPU::SGPR_256RegClass;
1661     case 16:
1662       return &AMDGPU::SGPR_512RegClass;
1663     }
1664   }
1665 
1666   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1667   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1668              ? TRI->getAGPRClassForBitWidth(BitWidth)
1669              : TRI->getVGPRClassForBitWidth(BitWidth);
1670 }
1671 
1672 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1673     CombineInfo &CI, CombineInfo &Paired,
1674     MachineBasicBlock::iterator InsertBefore) {
1675   MachineBasicBlock *MBB = CI.I->getParent();
1676   DebugLoc DL = CI.I->getDebugLoc();
1677 
1678   const unsigned Opcode = getNewOpcode(CI, Paired);
1679 
1680   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1681   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1682   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1683 
1684   // Copy to the new source register.
1685   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1686   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1687 
1688   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1689   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1690 
1691   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1692       .add(*Src0)
1693       .addImm(SubRegIdx0)
1694       .add(*Src1)
1695       .addImm(SubRegIdx1);
1696 
1697   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1698                  .addReg(SrcReg, RegState::Kill);
1699 
1700   AddressRegs Regs = getRegs(Opcode, *TII);
1701 
1702   if (Regs.VAddr)
1703     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1704 
1706   // It shouldn't be possible to get this far if the two instructions
1707   // don't have a single memoperand, because MachineInstr::mayAlias()
1708   // will return true if this is the case.
1709   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1710 
1711   MachineInstr *New =
1712     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1713         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1714         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1715         .addImm(CI.CPol)      // cpol
1716         .addImm(0)            // tfe
1717         .addImm(0)            // swz
1718         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1719 
1720   CI.I->eraseFromParent();
1721   Paired.I->eraseFromParent();
1722   return New;
1723 }
1724 
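// Return Val as an immediate operand if it is a legal inline constant;
// otherwise materialize it into a fresh SGPR with S_MOV_B32 and return that
// register (e.g. 64 stays an immediate, while 0x1800 needs an S_MOV_B32).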
1725 MachineOperand
1726 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1727   APInt V(32, Val, true);
1728   if (TII->isInlineConstant(V))
1729     return MachineOperand::CreateImm(Val);
1730 
1731   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1732   MachineInstr *Mov =
1733   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1734           TII->get(AMDGPU::S_MOV_B32), Reg)
1735     .addImm(Val);
1736   (void)Mov;
1737   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1738   return MachineOperand::CreateReg(Reg, false);
1739 }
1740 
1741 // Compute base address using Addr and return the final register.
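// A rough sketch of the emitted sequence (register names are illustrative):
//   %lo:vgpr_32, %carry:sreg_64_xexec =
//       V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32, %deadcarry:sreg_64_xexec =
//       V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry, 0
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1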
1742 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1743                                            const MemAddress &Addr) const {
1744   MachineBasicBlock *MBB = MI.getParent();
1745   MachineBasicBlock::iterator MBBI = MI.getIterator();
1746   DebugLoc DL = MI.getDebugLoc();
1747 
1748   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1749           Addr.Base.LoSubReg) &&
1750          "Expected 32-bit Base-Register-Low!!");
1751 
1752   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1753           Addr.Base.HiSubReg) &&
1754          "Expected 32-bit Base-Register-Hi!!");
1755 
1756   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
    createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1758   MachineOperand OffsetHi =
1759     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1760 
1761   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1762   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1763   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1764 
1765   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1766   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1767   MachineInstr *LoHalf =
1768     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1769       .addReg(CarryReg, RegState::Define)
1770       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1771       .add(OffsetLo)
1772       .addImm(0); // clamp bit
1773   (void)LoHalf;
1774   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1775 
1776   MachineInstr *HiHalf =
1777   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1778     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1779     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1780     .add(OffsetHi)
1781     .addReg(CarryReg, RegState::Kill)
1782     .addImm(0); // clamp bit
1783   (void)HiHalf;
1784   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1785 
1786   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1787   MachineInstr *FullBase =
1788     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1789       .addReg(DestSub0)
1790       .addImm(AMDGPU::sub0)
1791       .addReg(DestSub1)
1792       .addImm(AMDGPU::sub1);
1793   (void)FullBase;
1794   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1795 
1796   return FullDestReg;
1797 }
1798 
1799 // Update base and offset with the NewBase and NewOffset in MI.
1800 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1801                                                Register NewBase,
1802                                                int32_t NewOffset) const {
1803   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1804   Base->setReg(NewBase);
1805   Base->setIsKill(false);
1806   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1807 }
1808 
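// Return the constant offset carried by Op: either Op's own immediate or the
// immediate of a defining S_MOV_B32 (e.g. an operand defined by
// "%off:sgpr_32 = S_MOV_B32 8000" yields 8000); None otherwise.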
1809 Optional<int32_t>
1810 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1811   if (Op.isImm())
1812     return Op.getImm();
1813 
1814   if (!Op.isReg())
1815     return None;
1816 
1817   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1818   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1819       !Def->getOperand(1).isImm())
1820     return None;
1821 
1822   return Def->getOperand(1).getImm();
1823 }
1824 
// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// The base computation is expected to look like:
1829 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1830 //   %LO:vgpr_32, %c:sreg_64_xexec =
1831 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1832 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1833 //   %Base:vreg_64 =
1834 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
1837   if (!Base.isReg())
1838     return;
1839 
1840   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1841   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1842       || Def->getNumOperands() != 5)
1843     return;
1844 
1845   MachineOperand BaseLo = Def->getOperand(1);
1846   MachineOperand BaseHi = Def->getOperand(3);
1847   if (!BaseLo.isReg() || !BaseHi.isReg())
1848     return;
1849 
1850   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1851   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1852 
1853   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1854       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1855     return;
1856 
1857   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1858   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1859 
1860   auto Offset0P = extractConstOffset(*Src0);
1861   if (Offset0P)
1862     BaseLo = *Src1;
1863   else {
1864     if (!(Offset0P = extractConstOffset(*Src1)))
1865       return;
1866     BaseLo = *Src0;
1867   }
1868 
1869   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1870   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1871 
1872   if (Src0->isImm())
1873     std::swap(Src0, Src1);
1874 
1875   if (!Src1->isImm())
1876     return;
1877 
1878   uint64_t Offset1 = Src1->getImm();
1879   BaseHi = *Src0;
1880 
1881   Addr.Base.LoReg = BaseLo.getReg();
1882   Addr.Base.HiReg = BaseHi.getReg();
1883   Addr.Base.LoSubReg = BaseLo.getSubReg();
1884   Addr.Base.HiSubReg = BaseHi.getSubReg();
1885   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1886 }
1887 
1888 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1889     MachineInstr &MI,
1890     MemInfoMap &Visited,
1891     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1892 
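  // Only handle pure loads or pure stores; instructions that both load and
  // store (e.g. atomics with a return value) or do neither are not candidates.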
1893   if (!(MI.mayLoad() ^ MI.mayStore()))
1894     return false;
1895 
1896   // TODO: Support flat and scratch.
1897   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1898     return false;
1899 
1900   if (MI.mayLoad() &&
1901       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1902     return false;
1903 
1904   if (AnchorList.count(&MI))
1905     return false;
1906 
1907   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1908 
1909   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1910     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1911     return false;
1912   }
1913 
1914   // Step1: Find the base-registers and a 64bit constant offset.
1915   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1916   MemAddress MAddr;
1917   if (Visited.find(&MI) == Visited.end()) {
1918     processBaseWithConstOffset(Base, MAddr);
1919     Visited[&MI] = MAddr;
1920   } else
1921     MAddr = Visited[&MI];
1922 
1923   if (MAddr.Offset == 0) {
1924     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1925                          " constant offsets that can be promoted.\n";);
1926     return false;
1927   }
1928 
1929   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1930              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1931 
  // Step2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset has the largest 13-bit-encodable
  // distance from MI's offset.
  // E.g. (64-bit loads)
1935   // bb:
1936   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1937   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1938   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1939   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1940   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1941   //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) is a 13-bit distance away. Both &a + 6144 and
  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because the larger distance can
  // presumably accommodate more intermediate addresses.
1947   //
  // Step3: Compute the new base (&a + 8192) above load1, then rewrite load1
  // through load4 to use it with adjusted offsets:
1950   //   addr = &a + 8192
1951   //   load1 = load(addr,       -4096)
1952   //   load2 = load(addr,       -2048)
1953   //   load3 = load(addr,       0)
1954   //   load4 = load(addr,       2048)
1955   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1956   //
1957   MachineInstr *AnchorInst = nullptr;
1958   MemAddress AnchorAddr;
1959   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1960   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1961 
1962   MachineBasicBlock *MBB = MI.getParent();
1963   MachineBasicBlock::iterator E = MBB->end();
1964   MachineBasicBlock::iterator MBBI = MI.getIterator();
1965   ++MBBI;
1966   const SITargetLowering *TLI =
1967     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1968 
1969   for ( ; MBBI != E; ++MBBI) {
1970     MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
1973     if (MINext.getOpcode() != MI.getOpcode() ||
1974         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1975       continue;
1976 
1977     const MachineOperand &BaseNext =
1978       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1979     MemAddress MAddrNext;
1980     if (Visited.find(&MINext) == Visited.end()) {
1981       processBaseWithConstOffset(BaseNext, MAddrNext);
1982       Visited[&MINext] = MAddrNext;
1983     } else
1984       MAddrNext = Visited[&MINext];
1985 
1986     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1987         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1988         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1989         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1990       continue;
1991 
1992     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1993 
1994     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1995     TargetLoweringBase::AddrMode AM;
1996     AM.HasBaseReg = true;
1997     AM.BaseOffs = Dist;
1998     if (TLI->isLegalGlobalAddressingMode(AM) &&
1999         (uint32_t)std::abs(Dist) > MaxDist) {
2000       MaxDist = std::abs(Dist);
2001 
2002       AnchorAddr = MAddrNext;
2003       AnchorInst = &MINext;
2004     }
2005   }
2006 
2007   if (AnchorInst) {
2008     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2009                AnchorInst->dump());
2010     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2011                <<  AnchorAddr.Offset << "\n\n");
2012 
2013     // Instead of moving up, just re-compute anchor-instruction's base address.
2014     Register Base = computeBase(MI, AnchorAddr);
2015 
2016     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2017     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2018 
2019     for (auto P : InstsWCommonBase) {
2020       TargetLoweringBase::AddrMode AM;
2021       AM.HasBaseReg = true;
2022       AM.BaseOffs = P.second - AnchorAddr.Offset;
2023 
2024       if (TLI->isLegalGlobalAddressingMode(AM)) {
2025         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2026                    dbgs() << ")"; P.first->dump());
2027         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2028         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2029       }
2030     }
2031     AnchorList.insert(AnchorInst);
2032     return true;
2033   }
2034 
2035   return false;
2036 }
2037 
2038 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2039                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2040   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2041     if (AddrList.front().InstClass == CI.InstClass &&
2042         AddrList.front().IsAGPR == CI.IsAGPR &&
2043         AddrList.front().hasSameBaseAddress(*CI.I)) {
2044       AddrList.emplace_back(CI);
2045       return;
2046     }
2047   }
2048 
2049   // Base address not found, so add a new list.
2050   MergeableInsts.emplace_back(1, CI);
2051 }
2052 
2053 std::pair<MachineBasicBlock::iterator, bool>
2054 SILoadStoreOptimizer::collectMergeableInsts(
2055     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2056     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2057     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2058   bool Modified = false;
2059 
  // Sort potentially mergeable instructions into lists, one list per base
  // address.
2061   unsigned Order = 0;
2062   MachineBasicBlock::iterator BlockI = Begin;
2063   for (; BlockI != End; ++BlockI) {
2064     MachineInstr &MI = *BlockI;
2065 
    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
2068     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2069       Modified = true;
2070 
    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Accesses after such a barrier are still considered, but in a
    // separate merge list.
2073     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2074       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2075 
2076       // Search will resume after this instruction in a separate merge list.
2077       ++BlockI;
2078       break;
2079     }
2080 
2081     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2082     if (InstClass == UNKNOWN)
2083       continue;
2084 
2085     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2086     int Swizzled =
2087         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2088     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2089       continue;
2090 
2091     CombineInfo CI;
2092     CI.setMI(MI, *this);
2093     CI.Order = Order++;
2094 
2095     if (!CI.hasMergeableAddress(*MRI))
2096       continue;
2097 
2098     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However, we report that ds_write2 takes only VGPR
      //        data so that machine copy propagation does not create an
      //        illegal instruction with VGPR and AGPR sources. Consequently,
      //        the verifier would complain if we created such an instruction.
2105       continue;
2106     }
2107 
2108     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2109 
2110     addInstToMergeableList(CI, MergeableInsts);
2111   }
2112 
  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset so that merge candidates end up adjacent
  // to each other, and discard lists with fewer than two entries, since a
  // merge needs at least two instructions. The actual pairing and merging is
  // done later in optimizeInstsWithSameBaseAddr().
2119 
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
2122 
2123     std::list<CombineInfo> &MergeList = *I;
2124     if (MergeList.size() <= 1) {
2125       // This means we have found only one instruction with a given address
2126       // that can be merged, and we need at least 2 instructions to do a merge,
2127       // so this list can be discarded.
2128       I = MergeableInsts.erase(I);
2129       continue;
2130     }
2131 
    // Sort the list by offset so that mergeable instructions end up adjacent
    // to each other, which makes it easier to find matches.
2135     MergeList.sort(
2136         [] (const CombineInfo &A, const CombineInfo &B) {
2137           return A.Offset < B.Offset;
2138         });
2139     ++I;
2140   }
2141 
2142   return std::make_pair(BlockI, Modified);
2143 }
2144 
2145 // Scan through looking for adjacent LDS operations with constant offsets from
2146 // the same base register. We rely on the scheduler to do the hard work of
2147 // clustering nearby loads, and assume these are all adjacent.
2148 bool SILoadStoreOptimizer::optimizeBlock(
2149                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2150   bool Modified = false;
2151 
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
2154     std::list<CombineInfo> &MergeList = *I;
2155 
2156     bool OptimizeListAgain = false;
2157     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2158       // We weren't able to make any changes, so delete the list so we don't
2159       // process the same instructions the next time we try to optimize this
2160       // block.
2161       I = MergeableInsts.erase(I);
2162       continue;
2163     }
2164 
2165     Modified = true;
2166 
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2169     if (!OptimizeListAgain) {
2170       I = MergeableInsts.erase(I);
2171       continue;
2172     }
2173     OptimizeAgain = true;
2174   }
2175   return Modified;
2176 }
2177 
2178 bool
2179 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2180                                           std::list<CombineInfo> &MergeList,
2181                                           bool &OptimizeListAgain) {
2182   if (MergeList.empty())
2183     return false;
2184 
2185   bool Modified = false;
2186 
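  // Walk adjacent pairs of the offset-sorted list. CI always refers to the
  // instruction that comes first in program order (lower Order) and Paired to
  // the later one.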
2187   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2188        Next = std::next(I)) {
2189 
2190     auto First = I;
2191     auto Second = Next;
2192 
2193     if ((*First).Order > (*Second).Order)
2194       std::swap(First, Second);
2195     CombineInfo &CI = *First;
2196     CombineInfo &Paired = *Second;
2197 
2198     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2199     if (!Where) {
2200       ++I;
2201       continue;
2202     }
2203 
2204     Modified = true;
2205 
2206     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2207 
2208     MachineBasicBlock::iterator NewMI;
2209     switch (CI.InstClass) {
2210     default:
2211       llvm_unreachable("unknown InstClass");
2212       break;
2213     case DS_READ:
2214       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2215       break;
2216     case DS_WRITE:
2217       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2218       break;
2219     case S_BUFFER_LOAD_IMM:
2220       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2221       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2222       break;
2223     case BUFFER_LOAD:
2224       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2225       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2226       break;
2227     case BUFFER_STORE:
2228       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2229       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2230       break;
2231     case MIMG:
2232       NewMI = mergeImagePair(CI, Paired, Where->I);
2233       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2234       break;
2235     case TBUFFER_LOAD:
2236       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2237       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2238       break;
2239     case TBUFFER_STORE:
2240       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2241       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2242       break;
2243     case GLOBAL_LOAD:
2244     case GLOBAL_LOAD_SADDR:
2245       NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
2246       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2247       break;
2248     case GLOBAL_STORE:
2249     case GLOBAL_STORE_SADDR:
2250       NewMI = mergeGlobalStorePair(CI, Paired, Where->I);
2251       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2252       break;
2253     }
2254     CI.setMI(NewMI, *this);
2255     CI.Order = Where->Order;
2256     if (I == Second)
2257       I = Next;
2258 
2259     MergeList.erase(Second);
2260   }
2261 
2262   return Modified;
2263 }
2264 
2265 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2266   if (skipFunction(MF.getFunction()))
2267     return false;
2268 
2269   STM = &MF.getSubtarget<GCNSubtarget>();
2270   if (!STM->loadStoreOptEnabled())
2271     return false;
2272 
2273   TII = STM->getInstrInfo();
2274   TRI = &TII->getRegisterInfo();
2275 
2276   MRI = &MF.getRegInfo();
2277   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2278 
2279   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2280 
2281   bool Modified = false;
2282 
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
2285   SmallPtrSet<MachineInstr *, 4> AnchorList;
2286   MemInfoMap Visited;
2287 
2288   for (MachineBasicBlock &MBB : MF) {
2289     MachineBasicBlock::iterator SectionEnd;
2290     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2291          I = SectionEnd) {
2292       bool CollectModified;
2293       std::list<std::list<CombineInfo>> MergeableInsts;
2294 
2295       // First pass: Collect list of all instructions we know how to merge in a
2296       // subset of the block.
2297       std::tie(SectionEnd, CollectModified) =
2298           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2299 
2300       Modified |= CollectModified;
2301 
2302       do {
2303         OptimizeAgain = false;
2304         Modified |= optimizeBlock(MergeableInsts);
2305       } while (OptimizeAgain);
2306     }
2307 
2308     Visited.clear();
2309     AnchorList.clear();
2310   }
2311 
2312   return Modified;
2313 }
2314