1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the immediate field
23 // by adjusting the base. It tries to reuse a base from a nearby instruction
24 // that leaves a 13-bit constant offset, and then promotes that 13-bit offset
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently fails to merge stores of constants because the load of
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together to fit within 8 bits, we can
56 //   add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82   GLOBAL_LOAD,
83   GLOBAL_LOAD_SADDR,
84   GLOBAL_STORE,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE
88 };
89 
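// Describes which address operands a memory instruction carries (vaddr(s),
// srsrc, soffset, saddr, etc.), i.e. which operands form its base address.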
90 struct AddressRegs {
91   unsigned char NumVAddrs = 0;
92   bool SBase = false;
93   bool SRsrc = false;
94   bool SOffset = false;
95   bool SAddr = false;
96   bool VAddr = false;
97   bool Addr = false;
98   bool SSamp = false;
99 };
100 
101 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
102 const unsigned MaxAddressRegs = 12 + 1 + 1;
103 
104 class SILoadStoreOptimizer : public MachineFunctionPass {
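  // Bookkeeping for one candidate memory instruction: the instruction itself
  // plus the decoded fields (offset, width, format, dmask, address operands)
  // used to decide whether it can be merged with another instruction.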
105   struct CombineInfo {
106     MachineBasicBlock::iterator I;
107     unsigned EltSize;
108     unsigned Offset;
109     unsigned Width;
110     unsigned Format;
111     unsigned BaseOff;
112     unsigned DMask;
113     InstClassEnum InstClass;
114     unsigned CPol = 0;
115     bool IsAGPR;
116     bool UseST64;
117     int AddrIdx[MaxAddressRegs];
118     const MachineOperand *AddrReg[MaxAddressRegs];
119     unsigned NumAddresses;
120     unsigned Order;
121 
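    // Return true if MI has the same base address operands as this
    // instruction: equal immediates, or equal registers and subregisters.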
122     bool hasSameBaseAddress(const MachineInstr &MI) {
123       for (unsigned i = 0; i < NumAddresses; i++) {
124         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
125 
126         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
127           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
128               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
129             return false;
130           }
131           continue;
132         }
133 
134         // Check same base pointer. Be careful of subregisters, which can occur
135         // with vectors of pointers.
136         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
137             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
138           return false;
139         }
140       }
141       return true;
142     }
143 
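    // Return true if every address operand is something this pass can merge
    // on: an immediate, or a virtual register with more than one use.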
144     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
145       for (unsigned i = 0; i < NumAddresses; ++i) {
146         const MachineOperand *AddrOp = AddrReg[i];
147         // Immediates are always OK.
148         if (AddrOp->isImm())
149           continue;
150 
151         // Don't try to merge addresses that aren't immediates or registers.
152         // TODO: Should be possible to merge FrameIndexes and maybe some
153         // other non-register operands.
154         if (!AddrOp->isReg())
155           return false;
156 
157         // TODO: We should be able to merge physical reg addresses.
158         if (AddrOp->getReg().isPhysical())
159           return false;
160 
161         // If an address has only one use then there will be no other
162         // instructions with the same address, so we can't merge this one.
163         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
164           return false;
165       }
166       return true;
167     }
168 
169     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
170 
171     // Compare by pointer order.
172     bool operator<(const CombineInfo& Other) const {
173       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
174     }
175   };
176 
177   struct BaseRegisters {
178     Register LoReg;
179     Register HiReg;
180 
181     unsigned LoSubReg = 0;
182     unsigned HiSubReg = 0;
183   };
184 
185   struct MemAddress {
186     BaseRegisters Base;
187     int64_t Offset = 0;
188   };
189 
190   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
191 
192 private:
193   const GCNSubtarget *STM = nullptr;
194   const SIInstrInfo *TII = nullptr;
195   const SIRegisterInfo *TRI = nullptr;
196   MachineRegisterInfo *MRI = nullptr;
197   AliasAnalysis *AA = nullptr;
198   bool OptimizeAgain;
199 
200   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
201                            const DenseSet<Register> &ARegUses,
202                            const MachineInstr &A, const MachineInstr &B) const;
203   static bool dmasksCanBeCombined(const CombineInfo &CI,
204                                   const SIInstrInfo &TII,
205                                   const CombineInfo &Paired);
206   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
207                                    CombineInfo &Paired, bool Modify = false);
208   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
209                         const CombineInfo &Paired);
210   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
211   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
212                                                      const CombineInfo &Paired);
213   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
214                                                     const CombineInfo &Paired);
215   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
216 
217   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
218 
219   unsigned read2Opcode(unsigned EltSize) const;
220   unsigned read2ST64Opcode(unsigned EltSize) const;
221   MachineBasicBlock::iterator
222   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
223                  MachineBasicBlock::iterator InsertBefore);
224 
225   unsigned write2Opcode(unsigned EltSize) const;
226   unsigned write2ST64Opcode(unsigned EltSize) const;
227   MachineBasicBlock::iterator
228   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
229                   MachineBasicBlock::iterator InsertBefore);
230   MachineBasicBlock::iterator
231   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
232                  MachineBasicBlock::iterator InsertBefore);
233   MachineBasicBlock::iterator
234   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
235                           MachineBasicBlock::iterator InsertBefore);
236   MachineBasicBlock::iterator
237   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
238                       MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
241                        MachineBasicBlock::iterator InsertBefore);
242   MachineBasicBlock::iterator
243   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
244                        MachineBasicBlock::iterator InsertBefore);
245   MachineBasicBlock::iterator
246   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
247                         MachineBasicBlock::iterator InsertBefore);
248   MachineBasicBlock::iterator
249   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
250                     MachineBasicBlock::iterator InsertBefore);
251   MachineBasicBlock::iterator
252   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
253                      MachineBasicBlock::iterator InsertBefore);
254 
255   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
256                            int32_t NewOffset) const;
257   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
258   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
259   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
260   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
261   /// Promotes constant offset to the immediate by adjusting the base. It
262   /// tries to use a base from the nearby instructions that allows it to have
263   /// a 13-bit constant offset which gets promoted to the immediate.
264   bool promoteConstantOffsetToImm(MachineInstr &CI,
265                                   MemInfoMap &Visited,
266                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
267   void addInstToMergeableList(const CombineInfo &CI,
268                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
269 
270   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
271       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
272       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
273       std::list<std::list<CombineInfo>> &MergeableInsts) const;
274 
275   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
276                                                      const CombineInfo &Paired);
277 
278 public:
279   static char ID;
280 
281   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
282     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
283   }
284 
285   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
286                                      bool &OptimizeListAgain);
287   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
288 
289   bool runOnMachineFunction(MachineFunction &MF) override;
290 
291   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
292 
293   void getAnalysisUsage(AnalysisUsage &AU) const override {
294     AU.setPreservesCFG();
295     AU.addRequired<AAResultsWrapperPass>();
296 
297     MachineFunctionPass::getAnalysisUsage(AU);
298   }
299 
300   MachineFunctionProperties getRequiredProperties() const override {
301     return MachineFunctionProperties()
302       .set(MachineFunctionProperties::Property::IsSSA);
303   }
304 };
305 
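// Return the width of the access in dwords (format components for MTBUF,
// dmask population count for MIMG), or 0 for unhandled opcodes.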
306 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
307   const unsigned Opc = MI.getOpcode();
308 
309   if (TII.isMUBUF(Opc)) {
310     // FIXME: Handle d16 correctly
311     return AMDGPU::getMUBUFElements(Opc);
312   }
313   if (TII.isMIMG(MI)) {
314     uint64_t DMaskImm =
315         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
316     return countPopulation(DMaskImm);
317   }
318   if (TII.isMTBUF(Opc)) {
319     return AMDGPU::getMTBUFElements(Opc);
320   }
321 
322   switch (Opc) {
323   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
324   case AMDGPU::GLOBAL_LOAD_DWORD:
325   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
326   case AMDGPU::GLOBAL_STORE_DWORD:
327   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
328   case AMDGPU::FLAT_LOAD_DWORD:
329   case AMDGPU::FLAT_STORE_DWORD:
330     return 1;
331   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
332   case AMDGPU::GLOBAL_LOAD_DWORDX2:
333   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
334   case AMDGPU::GLOBAL_STORE_DWORDX2:
335   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
336   case AMDGPU::FLAT_LOAD_DWORDX2:
337   case AMDGPU::FLAT_STORE_DWORDX2:
338     return 2;
339   case AMDGPU::GLOBAL_LOAD_DWORDX3:
340   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
341   case AMDGPU::GLOBAL_STORE_DWORDX3:
342   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
343   case AMDGPU::FLAT_LOAD_DWORDX3:
344   case AMDGPU::FLAT_STORE_DWORDX3:
345     return 3;
346   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
347   case AMDGPU::GLOBAL_LOAD_DWORDX4:
348   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
349   case AMDGPU::GLOBAL_STORE_DWORDX4:
350   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
351   case AMDGPU::FLAT_LOAD_DWORDX4:
352   case AMDGPU::FLAT_STORE_DWORDX4:
353     return 4;
354   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
355     return 8;
356   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
357   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
358   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
359   case AMDGPU::DS_WRITE_B32_gfx9:
360     return 1;
361   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
362   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
363   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
364   case AMDGPU::DS_WRITE_B64_gfx9:
365     return 2;
366   default:
367     return 0;
368   }
369 }
370 
371 /// Maps instruction opcode to enum InstClassEnum.
372 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
373   switch (Opc) {
374   default:
375     if (TII.isMUBUF(Opc)) {
376       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
377       default:
378         return UNKNOWN;
379       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
380       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
381       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
382       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
383         return BUFFER_LOAD;
384       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
385       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
386       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
387       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
388         return BUFFER_STORE;
389       }
390     }
391     if (TII.isMIMG(Opc)) {
392       // Ignore instructions encoded without vaddr.
393       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
394           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
395         return UNKNOWN;
396       // Ignore BVH instructions
397       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
398         return UNKNOWN;
399       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
400       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
401           TII.isGather4(Opc))
402         return UNKNOWN;
403       return MIMG;
404     }
405     if (TII.isMTBUF(Opc)) {
406       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
407       default:
408         return UNKNOWN;
409       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
410       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
411       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
412       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
413         return TBUFFER_LOAD;
414       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
415       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
416       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
417       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
418         return TBUFFER_STORE;
419       }
420     }
421     return UNKNOWN;
422   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
423   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
424   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
425   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
426     return S_BUFFER_LOAD_IMM;
427   case AMDGPU::DS_READ_B32:
428   case AMDGPU::DS_READ_B32_gfx9:
429   case AMDGPU::DS_READ_B64:
430   case AMDGPU::DS_READ_B64_gfx9:
431     return DS_READ;
432   case AMDGPU::DS_WRITE_B32:
433   case AMDGPU::DS_WRITE_B32_gfx9:
434   case AMDGPU::DS_WRITE_B64:
435   case AMDGPU::DS_WRITE_B64_gfx9:
436     return DS_WRITE;
437   case AMDGPU::GLOBAL_LOAD_DWORD:
438   case AMDGPU::GLOBAL_LOAD_DWORDX2:
439   case AMDGPU::GLOBAL_LOAD_DWORDX3:
440   case AMDGPU::GLOBAL_LOAD_DWORDX4:
441     return GLOBAL_LOAD;
442   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
443   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
444   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
445   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
446     return GLOBAL_LOAD_SADDR;
447   case AMDGPU::GLOBAL_STORE_DWORD:
448   case AMDGPU::GLOBAL_STORE_DWORDX2:
449   case AMDGPU::GLOBAL_STORE_DWORDX3:
450   case AMDGPU::GLOBAL_STORE_DWORDX4:
451     return GLOBAL_STORE;
452   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
453   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
454   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
455   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
456     return GLOBAL_STORE_SADDR;
457   case AMDGPU::FLAT_LOAD_DWORD:
458   case AMDGPU::FLAT_LOAD_DWORDX2:
459   case AMDGPU::FLAT_LOAD_DWORDX3:
460   case AMDGPU::FLAT_LOAD_DWORDX4:
461     return FLAT_LOAD;
462   case AMDGPU::FLAT_STORE_DWORD:
463   case AMDGPU::FLAT_STORE_DWORDX2:
464   case AMDGPU::FLAT_STORE_DWORDX3:
465   case AMDGPU::FLAT_STORE_DWORDX4:
466     return FLAT_STORE;
467   }
468 }
469 
470 /// Determines instruction subclass from opcode. Only instructions
471 /// of the same subclass can be merged together. The merged instruction may have
472 /// a different subclass but must have the same class.
473 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
474   switch (Opc) {
475   default:
476     if (TII.isMUBUF(Opc))
477       return AMDGPU::getMUBUFBaseOpcode(Opc);
478     if (TII.isMIMG(Opc)) {
479       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
480       assert(Info);
481       return Info->BaseOpcode;
482     }
483     if (TII.isMTBUF(Opc))
484       return AMDGPU::getMTBUFBaseOpcode(Opc);
485     return -1;
486   case AMDGPU::DS_READ_B32:
487   case AMDGPU::DS_READ_B32_gfx9:
488   case AMDGPU::DS_READ_B64:
489   case AMDGPU::DS_READ_B64_gfx9:
490   case AMDGPU::DS_WRITE_B32:
491   case AMDGPU::DS_WRITE_B32_gfx9:
492   case AMDGPU::DS_WRITE_B64:
493   case AMDGPU::DS_WRITE_B64_gfx9:
494     return Opc;
495   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
496   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
497   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
498   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
499     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
500   case AMDGPU::GLOBAL_LOAD_DWORD:
501   case AMDGPU::GLOBAL_LOAD_DWORDX2:
502   case AMDGPU::GLOBAL_LOAD_DWORDX3:
503   case AMDGPU::GLOBAL_LOAD_DWORDX4:
504     return AMDGPU::GLOBAL_LOAD_DWORD;
505   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
506   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
507   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
508   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
509     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
510   case AMDGPU::GLOBAL_STORE_DWORD:
511   case AMDGPU::GLOBAL_STORE_DWORDX2:
512   case AMDGPU::GLOBAL_STORE_DWORDX3:
513   case AMDGPU::GLOBAL_STORE_DWORDX4:
514     return AMDGPU::GLOBAL_STORE_DWORD;
515   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
516   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
517   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
518   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
519     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
520   case AMDGPU::FLAT_LOAD_DWORD:
521   case AMDGPU::FLAT_LOAD_DWORDX2:
522   case AMDGPU::FLAT_LOAD_DWORDX3:
523   case AMDGPU::FLAT_LOAD_DWORDX4:
524     return AMDGPU::FLAT_LOAD_DWORD;
525   case AMDGPU::FLAT_STORE_DWORD:
526   case AMDGPU::FLAT_STORE_DWORDX2:
527   case AMDGPU::FLAT_STORE_DWORDX3:
528   case AMDGPU::FLAT_STORE_DWORDX4:
529     return AMDGPU::FLAT_STORE_DWORD;
530   }
531 }
532 
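// Determine which address operands (vaddr, srsrc, soffset, saddr, ...) the
// given opcode uses.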
533 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
534   AddressRegs Result;
535 
536   if (TII.isMUBUF(Opc)) {
537     if (AMDGPU::getMUBUFHasVAddr(Opc))
538       Result.VAddr = true;
539     if (AMDGPU::getMUBUFHasSrsrc(Opc))
540       Result.SRsrc = true;
541     if (AMDGPU::getMUBUFHasSoffset(Opc))
542       Result.SOffset = true;
543 
544     return Result;
545   }
546 
547   if (TII.isMIMG(Opc)) {
548     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
549     if (VAddr0Idx >= 0) {
550       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
551       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
552     } else {
553       Result.VAddr = true;
554     }
555     Result.SRsrc = true;
556     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
557     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
558       Result.SSamp = true;
559 
560     return Result;
561   }
562   if (TII.isMTBUF(Opc)) {
563     if (AMDGPU::getMTBUFHasVAddr(Opc))
564       Result.VAddr = true;
565     if (AMDGPU::getMTBUFHasSrsrc(Opc))
566       Result.SRsrc = true;
567     if (AMDGPU::getMTBUFHasSoffset(Opc))
568       Result.SOffset = true;
569 
570     return Result;
571   }
572 
573   switch (Opc) {
574   default:
575     return Result;
576   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
577   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
578   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
579   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
580     Result.SBase = true;
581     return Result;
582   case AMDGPU::DS_READ_B32:
583   case AMDGPU::DS_READ_B64:
584   case AMDGPU::DS_READ_B32_gfx9:
585   case AMDGPU::DS_READ_B64_gfx9:
586   case AMDGPU::DS_WRITE_B32:
587   case AMDGPU::DS_WRITE_B64:
588   case AMDGPU::DS_WRITE_B32_gfx9:
589   case AMDGPU::DS_WRITE_B64_gfx9:
590     Result.Addr = true;
591     return Result;
592   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
593   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
594   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
595   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
596   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
597   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
598   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
599   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
600     Result.SAddr = true;
601     LLVM_FALLTHROUGH;
602   case AMDGPU::GLOBAL_LOAD_DWORD:
603   case AMDGPU::GLOBAL_LOAD_DWORDX2:
604   case AMDGPU::GLOBAL_LOAD_DWORDX3:
605   case AMDGPU::GLOBAL_LOAD_DWORDX4:
606   case AMDGPU::GLOBAL_STORE_DWORD:
607   case AMDGPU::GLOBAL_STORE_DWORDX2:
608   case AMDGPU::GLOBAL_STORE_DWORDX3:
609   case AMDGPU::GLOBAL_STORE_DWORDX4:
610   case AMDGPU::FLAT_LOAD_DWORD:
611   case AMDGPU::FLAT_LOAD_DWORDX2:
612   case AMDGPU::FLAT_LOAD_DWORDX3:
613   case AMDGPU::FLAT_LOAD_DWORDX4:
614   case AMDGPU::FLAT_STORE_DWORD:
615   case AMDGPU::FLAT_STORE_DWORDX2:
616   case AMDGPU::FLAT_STORE_DWORDX3:
617   case AMDGPU::FLAT_STORE_DWORDX4:
618     Result.VAddr = true;
619     return Result;
620   }
621 }
622 
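// Initialize this CombineInfo from MI: classify the instruction and record
// its element size, offset, width, format/dmask, cache policy and address
// operands.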
623 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
624                                               const SILoadStoreOptimizer &LSO) {
625   I = MI;
626   unsigned Opc = MI->getOpcode();
627   InstClass = getInstClass(Opc, *LSO.TII);
628 
629   if (InstClass == UNKNOWN)
630     return;
631 
632   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
633 
634   switch (InstClass) {
635   case DS_READ:
636     EltSize =
637           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
638                                                                           : 4;
639     break;
640   case DS_WRITE:
641     EltSize =
642           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
643                                                                             : 4;
644     break;
645   case S_BUFFER_LOAD_IMM:
646     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
647     break;
648   default:
649     EltSize = 4;
650     break;
651   }
652 
653   if (InstClass == MIMG) {
654     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
655     // Offset is not considered for MIMG instructions.
656     Offset = 0;
657   } else {
658     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
659     Offset = I->getOperand(OffsetIdx).getImm();
660   }
661 
662   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
663     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
664 
665   Width = getOpcodeWidth(*I, *LSO.TII);
666 
667   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
668     Offset &= 0xffff;
669   } else if (InstClass != MIMG) {
670     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
671   }
672 
673   AddressRegs Regs = getRegs(Opc, *LSO.TII);
674 
675   NumAddresses = 0;
676   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
677     AddrIdx[NumAddresses++] =
678         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
679   if (Regs.Addr)
680     AddrIdx[NumAddresses++] =
681         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
682   if (Regs.SBase)
683     AddrIdx[NumAddresses++] =
684         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
685   if (Regs.SRsrc)
686     AddrIdx[NumAddresses++] =
687         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
688   if (Regs.SOffset)
689     AddrIdx[NumAddresses++] =
690         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
691   if (Regs.SAddr)
692     AddrIdx[NumAddresses++] =
693         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
694   if (Regs.VAddr)
695     AddrIdx[NumAddresses++] =
696         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
697   if (Regs.SSamp)
698     AddrIdx[NumAddresses++] =
699         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
700   assert(NumAddresses <= MaxAddressRegs);
701 
702   for (unsigned J = 0; J < NumAddresses; J++)
703     AddrReg[J] = &I->getOperand(AddrIdx[J]);
704 }
705 
706 } // end anonymous namespace.
707 
708 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
709                       "SI Load Store Optimizer", false, false)
710 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
711 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
712                     false, false)
713 
714 char SILoadStoreOptimizer::ID = 0;
715 
716 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
717 
718 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
719   return new SILoadStoreOptimizer();
720 }
721 
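// Collect the registers defined and the registers read by MI into RegDefs and
// RegUses respectively.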
722 static void addDefsUsesToList(const MachineInstr &MI,
723                               DenseSet<Register> &RegDefs,
724                               DenseSet<Register> &RegUses) {
725   for (const auto &Op : MI.operands()) {
726     if (!Op.isReg())
727       continue;
728     if (Op.isDef())
729       RegDefs.insert(Op.getReg());
730     if (Op.readsReg())
731       RegUses.insert(Op.getReg());
732   }
733 }
734 
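// Return true if instruction A (whose defs and uses are given in ARegDefs and
// ARegUses) can be reordered past B: they must not alias through memory and
// there must be no register dependence between them.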
735 bool SILoadStoreOptimizer::canSwapInstructions(
736     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
737     const MachineInstr &A, const MachineInstr &B) const {
738   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
739       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
740     return false;
741   for (const auto &BOp : B.operands()) {
742     if (!BOp.isReg())
743       continue;
744     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
745       return false;
746     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
747       return false;
748   }
749   return true;
750 }
751 
752 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
753 // MMO for the combined operation with a new access size.
754 MachineMemOperand *
755 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
756                                                const CombineInfo &Paired) {
757   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
758   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
759 
760   unsigned Size = MMOa->getSize() + MMOb->getSize();
761 
762   // A base pointer for the combined operation is the same as the leading
763   // operation's pointer.
764   if (Paired < CI)
765     MMOa = MMOb;
766 
767   MachineFunction *MF = CI.I->getMF();
768   return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size);
769 }
770 
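// Check whether two MIMG instructions can be merged: neither may set tfe/lwe,
// their optional modifiers must match, and their dmasks must not interleave
// (the smaller mask must lie below the lowest set bit of the larger one).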
771 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
772                                                const SIInstrInfo &TII,
773                                                const CombineInfo &Paired) {
774   assert(CI.InstClass == MIMG);
775 
776   // Ignore instructions with tfe/lwe set.
777   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
778   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
779 
780   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
781     return false;
782 
783   // Check other optional immediate operands for equality.
784   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
785                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
786                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
787 
788   for (auto op : OperandsToMatch) {
789     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
790     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
791       return false;
792     if (Idx != -1 &&
793         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
794       return false;
795   }
796 
797   // Check DMask for overlaps.
798   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
799   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
800 
801   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
802   if ((1u << AllowedBitsForMin) <= MinMask)
803     return false;
804 
805   return true;
806 }
807 
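// Return the buffer format that matches OldFormat's bits-per-component and
// numeric format but has ComponentCount components, or 0 if no such format
// exists.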
808 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
809                                        unsigned ComponentCount,
810                                        const GCNSubtarget &STI) {
811   if (ComponentCount > 4)
812     return 0;
813 
814   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
815       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
816   if (!OldFormatInfo)
817     return 0;
818 
819   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
820       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
821                                            ComponentCount,
822                                            OldFormatInfo->NumFormat, STI);
823 
824   if (!NewFormatInfo)
825     return 0;
826 
827   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
828          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
829 
830   return NewFormatInfo->Format;
831 }
832 
833 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
834 // highest power of two. Note that the result is well defined for all inputs
835 // including corner cases like:
836 // - if Lo == Hi, return that value
837 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
838 // - if Lo > Hi, return 0 (as if the range wrapped around)
839 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
840   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
841 }
842 
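// Check whether the offsets of CI and Paired can be encoded in one merged
// instruction. Non-DS accesses must be adjacent and share a cache policy; DS
// offsets must fit the 8-bit offset fields, possibly via the stride-64 forms
// or a shifted base. With Modify set, CI and Paired are updated to the
// offsets the merged instruction will use.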
843 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
844                                                 const GCNSubtarget &STI,
845                                                 CombineInfo &Paired,
846                                                 bool Modify) {
847   assert(CI.InstClass != MIMG);
848 
849   // XXX - Would the same offset be OK? Is there any reason this would happen or
850   // be useful?
851   if (CI.Offset == Paired.Offset)
852     return false;
853 
854   // This won't be valid if the offset isn't aligned.
855   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
856     return false;
857 
858   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
859 
860     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
861         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
862     if (!Info0)
863       return false;
864     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
865         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
866     if (!Info1)
867       return false;
868 
869     if (Info0->BitsPerComp != Info1->BitsPerComp ||
870         Info0->NumFormat != Info1->NumFormat)
871       return false;
872 
873     // TODO: Should be possible to support more formats, but if format loads
874     // are not dword-aligned, the merged load might not be valid.
875     if (Info0->BitsPerComp != 32)
876       return false;
877 
878     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
879       return false;
880   }
881 
882   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
883   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
884   CI.UseST64 = false;
885   CI.BaseOff = 0;
886 
887   // Handle all non-DS instructions.
888   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
889     return (EltOffset0 + CI.Width == EltOffset1 ||
890             EltOffset1 + Paired.Width == EltOffset0) &&
891            CI.CPol == Paired.CPol;
892   }
893 
894   // If the offset in elements doesn't fit in 8 bits, we might be able to use
895   // the stride 64 versions.
896   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
897       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
898     if (Modify) {
899       CI.Offset = EltOffset0 / 64;
900       Paired.Offset = EltOffset1 / 64;
901       CI.UseST64 = true;
902     }
903     return true;
904   }
905 
906   // Check if the new offsets fit in the reduced 8-bit range.
907   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
908     if (Modify) {
909       CI.Offset = EltOffset0;
910       Paired.Offset = EltOffset1;
911     }
912     return true;
913   }
914 
915   // Try to shift base address to decrease offsets.
916   uint32_t Min = std::min(EltOffset0, EltOffset1);
917   uint32_t Max = std::max(EltOffset0, EltOffset1);
918 
919   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
920   if (((Max - Min) & ~Mask) == 0) {
921     if (Modify) {
922       // From the range of values we could use for BaseOff, choose the one that
923       // is aligned to the highest power of two, to maximise the chance that
924       // the same offset can be reused for other load/store pairs.
925       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
926       // Copy the low bits of the offsets, so that when we adjust them by
927       // subtracting BaseOff they will be multiples of 64.
928       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
929       CI.BaseOff = BaseOff * CI.EltSize;
930       CI.Offset = (EltOffset0 - BaseOff) / 64;
931       Paired.Offset = (EltOffset1 - BaseOff) / 64;
932       CI.UseST64 = true;
933     }
934     return true;
935   }
936 
937   if (isUInt<8>(Max - Min)) {
938     if (Modify) {
939       // From the range of values we could use for BaseOff, choose the one that
940       // is aligned to the highest power of two, to maximise the chance that
941       // the same offset can be reused for other load/store pairs.
942       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
943       CI.BaseOff = BaseOff * CI.EltSize;
944       CI.Offset = EltOffset0 - BaseOff;
945       Paired.Offset = EltOffset1 - BaseOff;
946     }
947     return true;
948   }
949 
950   return false;
951 }
952 
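// Check whether the combined width of CI and Paired can be encoded for this
// instruction class (e.g. dwordx3 only on targets that support it).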
953 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
954                                      const CombineInfo &CI,
955                                      const CombineInfo &Paired) {
956   const unsigned Width = (CI.Width + Paired.Width);
957   switch (CI.InstClass) {
958   default:
959     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
960   case S_BUFFER_LOAD_IMM:
961     switch (Width) {
962     default:
963       return false;
964     case 2:
965     case 4:
966     case 8:
967       return true;
968     }
969   }
970 }
971 
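// Return the register class of the instruction's data operand (vdst, vdata,
// data0, sdst or sdata), if any.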
972 const TargetRegisterClass *
973 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
974   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
975     return TRI->getRegClassForReg(*MRI, Dst->getReg());
976   }
977   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
978     return TRI->getRegClassForReg(*MRI, Src->getReg());
979   }
980   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
981     return TRI->getRegClassForReg(*MRI, Src->getReg());
982   }
983   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
984     return TRI->getRegClassForReg(*MRI, Dst->getReg());
985   }
986   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
987     return TRI->getRegClassForReg(*MRI, Src->getReg());
988   }
989   return nullptr;
990 }
991 
992 /// This function assumes that CI comes before Paired in a basic block. Return
993 /// an insertion point for the merged instruction or nullptr on failure.
994 SILoadStoreOptimizer::CombineInfo *
995 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
996                                            CombineInfo &Paired) {
997   // If another instruction has already been merged into CI, it may now be a
998   // type that we can't do any further merging into.
999   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1000     return nullptr;
1001   assert(CI.InstClass == Paired.InstClass);
1002 
1003   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1004       getInstSubclass(Paired.I->getOpcode(), *TII))
1005     return nullptr;
1006 
1007   // Check both offsets (or masks for MIMG) can be combined and fit in the
1008   // reduced range.
1009   if (CI.InstClass == MIMG) {
1010     if (!dmasksCanBeCombined(CI, *TII, Paired))
1011       return nullptr;
1012   } else {
1013     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1014       return nullptr;
1015   }
1016 
1017   DenseSet<Register> RegDefs;
1018   DenseSet<Register> RegUses;
1019   CombineInfo *Where;
1020   if (CI.I->mayLoad()) {
1021     // Try to hoist Paired up to CI.
1022     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1023     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1024       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1025         return nullptr;
1026     }
1027     Where = &CI;
1028   } else {
1029     // Try to sink CI down to Paired.
1030     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1031     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1032       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1033         return nullptr;
1034     }
1035     Where = &Paired;
1036   }
1037 
1038   // Call offsetsCanBeCombined with modify = true so that the offsets are
1039   // correct for the new instruction.  This should return true, because
1040   // this function should only be called on CombineInfo objects that
1041   // have already been confirmed to be mergeable.
1042   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1043     offsetsCanBeCombined(CI, *STM, Paired, true);
1044   return Where;
1045 }
1046 
1047 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1048   if (STM->ldsRequiresM0Init())
1049     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1050   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1051 }
1052 
1053 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1054   if (STM->ldsRequiresM0Init())
1055     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1056 
1057   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1058                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1059 }
1060 
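// Replace CI and Paired with a single ds_read2 / ds_read2st64 instruction,
// materializing a new base register first if a base offset was chosen, and
// copy the two halves of the result into the original destination registers.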
1061 MachineBasicBlock::iterator
1062 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1063                                      MachineBasicBlock::iterator InsertBefore) {
1064   MachineBasicBlock *MBB = CI.I->getParent();
1065 
1066   // Be careful, since the addresses could be subregisters themselves in weird
1067   // cases, like vectors of pointers.
1068   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1069 
1070   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1071   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1072 
1073   unsigned NewOffset0 = CI.Offset;
1074   unsigned NewOffset1 = Paired.Offset;
1075   unsigned Opc =
1076       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1077 
1078   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1079   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1080 
1081   if (NewOffset0 > NewOffset1) {
1082     // Canonicalize the merged instruction so the smaller offset comes first.
1083     std::swap(NewOffset0, NewOffset1);
1084     std::swap(SubRegIdx0, SubRegIdx1);
1085   }
1086 
1087   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1088          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1089 
1090   const MCInstrDesc &Read2Desc = TII->get(Opc);
1091 
1092   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1093   Register DestReg = MRI->createVirtualRegister(SuperRC);
1094 
1095   DebugLoc DL = CI.I->getDebugLoc();
1096 
1097   Register BaseReg = AddrReg->getReg();
1098   unsigned BaseSubReg = AddrReg->getSubReg();
1099   unsigned BaseRegFlags = 0;
1100   if (CI.BaseOff) {
1101     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1102     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1103         .addImm(CI.BaseOff);
1104 
1105     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1106     BaseRegFlags = RegState::Kill;
1107 
1108     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1109         .addReg(ImmReg)
1110         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1111         .addImm(0); // clamp bit
1112     BaseSubReg = 0;
1113   }
1114 
1115   MachineInstrBuilder Read2 =
1116       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1117           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1118           .addImm(NewOffset0)                        // offset0
1119           .addImm(NewOffset1)                        // offset1
1120           .addImm(0)                                 // gds
1121           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1122 
1123   (void)Read2;
1124 
1125   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1126 
1127   // Copy to the old destination registers.
1128   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1129       .add(*Dest0) // Copy to same destination including flags and sub reg.
1130       .addReg(DestReg, 0, SubRegIdx0);
1131   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1132       .add(*Dest1)
1133       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1134 
1135   CI.I->eraseFromParent();
1136   Paired.I->eraseFromParent();
1137 
1138   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1139   return Read2;
1140 }
1141 
1142 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1143   if (STM->ldsRequiresM0Init())
1144     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1145   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1146                         : AMDGPU::DS_WRITE2_B64_gfx9;
1147 }
1148 
1149 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1150   if (STM->ldsRequiresM0Init())
1151     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1152                           : AMDGPU::DS_WRITE2ST64_B64;
1153 
1154   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1155                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1156 }
1157 
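// Replace CI and Paired with a single ds_write2 / ds_write2st64 instruction
// that stores both data operands, materializing a new base register first if
// a base offset was chosen.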
1158 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1159     CombineInfo &CI, CombineInfo &Paired,
1160     MachineBasicBlock::iterator InsertBefore) {
1161   MachineBasicBlock *MBB = CI.I->getParent();
1162 
1163   // Be sure to use .add(), and not .addReg(), with these. We want to be
1164   // sure we preserve the subregister index and any register flags set on them.
1165   const MachineOperand *AddrReg =
1166       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1167   const MachineOperand *Data0 =
1168       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1169   const MachineOperand *Data1 =
1170       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1171 
1172   unsigned NewOffset0 = CI.Offset;
1173   unsigned NewOffset1 = Paired.Offset;
1174   unsigned Opc =
1175       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1176 
1177   if (NewOffset0 > NewOffset1) {
1178     // Canonicalize the merged instruction so the smaller offset comes first.
1179     std::swap(NewOffset0, NewOffset1);
1180     std::swap(Data0, Data1);
1181   }
1182 
1183   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1184          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1185 
1186   const MCInstrDesc &Write2Desc = TII->get(Opc);
1187   DebugLoc DL = CI.I->getDebugLoc();
1188 
1189   Register BaseReg = AddrReg->getReg();
1190   unsigned BaseSubReg = AddrReg->getSubReg();
1191   unsigned BaseRegFlags = 0;
1192   if (CI.BaseOff) {
1193     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1194     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1195         .addImm(CI.BaseOff);
1196 
1197     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1198     BaseRegFlags = RegState::Kill;
1199 
1200     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1201         .addReg(ImmReg)
1202         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1203         .addImm(0); // clamp bit
1204     BaseSubReg = 0;
1205   }
1206 
1207   MachineInstrBuilder Write2 =
1208       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1209           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1210           .add(*Data0)                               // data0
1211           .add(*Data1)                               // data1
1212           .addImm(NewOffset0)                        // offset0
1213           .addImm(NewOffset1)                        // offset1
1214           .addImm(0)                                 // gds
1215           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1216 
1217   CI.I->eraseFromParent();
1218   Paired.I->eraseFromParent();
1219 
1220   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1221   return Write2;
1222 }
1223 
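// Merge two MIMG loads into one load whose dmask is the union of the original
// dmasks, then copy each original result out of the wider destination.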
1224 MachineBasicBlock::iterator
1225 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1226                                      MachineBasicBlock::iterator InsertBefore) {
1227   MachineBasicBlock *MBB = CI.I->getParent();
1228   DebugLoc DL = CI.I->getDebugLoc();
1229   const unsigned Opcode = getNewOpcode(CI, Paired);
1230 
1231   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1232 
1233   Register DestReg = MRI->createVirtualRegister(SuperRC);
1234   unsigned MergedDMask = CI.DMask | Paired.DMask;
1235   unsigned DMaskIdx =
1236       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1237 
1238   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1239   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1240     if (I == DMaskIdx)
1241       MIB.addImm(MergedDMask);
1242     else
1243       MIB.add((*CI.I).getOperand(I));
1244   }
1245 
1246   // It shouldn't be possible to get this far if the two instructions
1247   // don't have a single memoperand, because MachineInstr::mayAlias()
1248   // will return true if this is the case.
1249   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1250 
1251   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1252 
1253   unsigned SubRegIdx0, SubRegIdx1;
1254   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1255 
1256   // Copy to the old destination registers.
1257   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1258   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1259   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1260 
1261   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1262       .add(*Dest0) // Copy to same destination including flags and sub reg.
1263       .addReg(DestReg, 0, SubRegIdx0);
1264   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1265       .add(*Dest1)
1266       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1267 
1268   CI.I->eraseFromParent();
1269   Paired.I->eraseFromParent();
1270   return New;
1271 }
1272 
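// Merge two s_buffer_load_dword* instructions into one wider load at the
// smaller of the two offsets, then copy the halves into the original
// destinations.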
1273 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1274     CombineInfo &CI, CombineInfo &Paired,
1275     MachineBasicBlock::iterator InsertBefore) {
1276   MachineBasicBlock *MBB = CI.I->getParent();
1277   DebugLoc DL = CI.I->getDebugLoc();
1278   const unsigned Opcode = getNewOpcode(CI, Paired);
1279 
1280   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1281 
1282   Register DestReg = MRI->createVirtualRegister(SuperRC);
1283   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1284 
1285   // It shouldn't be possible to get this far if the two instructions
1286   // don't have a single memoperand, because MachineInstr::mayAlias()
1287   // will return true if this is the case.
1288   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1289 
1290   MachineInstr *New =
1291       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1292           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1293           .addImm(MergedOffset) // offset
1294           .addImm(CI.CPol)      // cpol
1295           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1296 
1297   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1298   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1299   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1300 
1301   // Copy to the old destination registers.
1302   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1303   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1304   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1305 
1306   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1307       .add(*Dest0) // Copy to same destination including flags and sub reg.
1308       .addReg(DestReg, 0, SubRegIdx0);
1309   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1310       .add(*Dest1)
1311       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1312 
1313   CI.I->eraseFromParent();
1314   Paired.I->eraseFromParent();
1315   return New;
1316 }
1317 
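// Merge two MUBUF buffer loads into one wider load at the smaller of the two
// offsets, then copy the halves into the original destinations.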
1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319     CombineInfo &CI, CombineInfo &Paired,
1320     MachineBasicBlock::iterator InsertBefore) {
1321   MachineBasicBlock *MBB = CI.I->getParent();
1322   DebugLoc DL = CI.I->getDebugLoc();
1323 
1324   const unsigned Opcode = getNewOpcode(CI, Paired);
1325 
1326   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327 
1328   // Create the new destination register for the merged load.
1329   Register DestReg = MRI->createVirtualRegister(SuperRC);
1330   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331 
1332   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1333 
1334   AddressRegs Regs = getRegs(Opcode, *TII);
1335 
1336   if (Regs.VAddr)
1337     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338 
1339   // It shouldn't be possible to get this far if the two instructions
1340   // don't have a single memoperand, because MachineInstr::mayAlias()
1341   // will return true if this is the case.
1342   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343 
1344   MachineInstr *New =
1345     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1346         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1347         .addImm(MergedOffset) // offset
1348         .addImm(CI.CPol)      // cpol
1349         .addImm(0)            // tfe
1350         .addImm(0)            // swz
1351         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1352 
1353   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1354   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1355   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1356 
1357   // Copy to the old destination registers.
1358   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1359   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1360   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1361 
1362   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1363       .add(*Dest0) // Copy to same destination including flags and sub reg.
1364       .addReg(DestReg, 0, SubRegIdx0);
1365   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1366       .add(*Dest1)
1367       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1368 
1369   CI.I->eraseFromParent();
1370   Paired.I->eraseFromParent();
1371   return New;
1372 }
1373 
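// Merge two tbuffer loads into one wider load using the joined buffer format,
// then copy the halves into the original destinations.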
1374 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1375     CombineInfo &CI, CombineInfo &Paired,
1376     MachineBasicBlock::iterator InsertBefore) {
1377   MachineBasicBlock *MBB = CI.I->getParent();
1378   DebugLoc DL = CI.I->getDebugLoc();
1379 
1380   const unsigned Opcode = getNewOpcode(CI, Paired);
1381 
1382   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1383 
1384   // Create the new destination register for the merged load.
1385   Register DestReg = MRI->createVirtualRegister(SuperRC);
1386   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1387 
1388   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1389 
1390   AddressRegs Regs = getRegs(Opcode, *TII);
1391 
1392   if (Regs.VAddr)
1393     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1394 
1395   unsigned JoinedFormat =
1396       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1397 
1398   // It shouldn't be possible to get this far if the two instructions
1399   // don't have a single memoperand, because MachineInstr::mayAlias()
1400   // will return true if this is the case.
1401   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1402 
1403   MachineInstr *New =
1404       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1405           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1406           .addImm(MergedOffset) // offset
1407           .addImm(JoinedFormat) // format
1408           .addImm(CI.CPol)      // cpol
1409           .addImm(0)            // tfe
1410           .addImm(0)            // swz
1411           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1412 
1413   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1414   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1415   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1416 
1417   // Copy to the old destination registers.
1418   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1419   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1420   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1421 
1422   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1423       .add(*Dest0) // Copy to same destination including flags and sub reg.
1424       .addReg(DestReg, 0, SubRegIdx0);
1425   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1426       .add(*Dest1)
1427       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1428 
1429   CI.I->eraseFromParent();
1430   Paired.I->eraseFromParent();
1431   return New;
1432 }
1433 
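// Merge two TBUFFER (typed buffer) stores: pack both data operands into a
// REG_SEQUENCE and emit a single wider store.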
1434 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1435     CombineInfo &CI, CombineInfo &Paired,
1436     MachineBasicBlock::iterator InsertBefore) {
1437   MachineBasicBlock *MBB = CI.I->getParent();
1438   DebugLoc DL = CI.I->getDebugLoc();
1439 
1440   const unsigned Opcode = getNewOpcode(CI, Paired);
1441 
1442   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1443   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1444   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1445 
1446   // Copy to the new source register.
1447   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1448   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1449 
1450   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1451   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1452 
1453   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1454       .add(*Src0)
1455       .addImm(SubRegIdx0)
1456       .add(*Src1)
1457       .addImm(SubRegIdx1);
1458 
1459   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1460                  .addReg(SrcReg, RegState::Kill);
1461 
1462   AddressRegs Regs = getRegs(Opcode, *TII);
1463 
1464   if (Regs.VAddr)
1465     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1466 
1467   unsigned JoinedFormat =
1468       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1469 
1470   // It shouldn't be possible to get this far if the two instructions
1471   // don't have a single memoperand, because MachineInstr::mayAlias()
1472   // will return true if this is the case.
1473   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1474 
1475   MachineInstr *New =
1476       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1477           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1478           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1479           .addImm(JoinedFormat)                     // format
1480           .addImm(CI.CPol)                          // cpol
1481           .addImm(0)                                // tfe
1482           .addImm(0)                                // swz
1483           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1484 
1485   CI.I->eraseFromParent();
1486   Paired.I->eraseFromParent();
1487   return New;
1488 }
1489 
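// Merge two FLAT or GLOBAL loads into a single wider load and copy the result
// into the original destination registers.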
1490 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1491     CombineInfo &CI, CombineInfo &Paired,
1492     MachineBasicBlock::iterator InsertBefore) {
1493   MachineBasicBlock *MBB = CI.I->getParent();
1494   DebugLoc DL = CI.I->getDebugLoc();
1495 
1496   const unsigned Opcode = getNewOpcode(CI, Paired);
1497 
1498   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1499   Register DestReg = MRI->createVirtualRegister(SuperRC);
1500 
1501   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1502 
1503   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1504     MIB.add(*SAddr);
1505 
1506   MachineInstr *New =
1507     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1508        .addImm(std::min(CI.Offset, Paired.Offset))
1509        .addImm(CI.CPol)
1510        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1511 
1512   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1513   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1514   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1515 
1516   // Copy to the old destination registers.
1517   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1518   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1519   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1520 
1521   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1522       .add(*Dest0) // Copy to same destination including flags and sub reg.
1523       .addReg(DestReg, 0, SubRegIdx0);
1524   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1525       .add(*Dest1)
1526       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1527 
1528   CI.I->eraseFromParent();
1529   Paired.I->eraseFromParent();
1530   return New;
1531 }
1532 
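// Merge two FLAT or GLOBAL stores: pack both data operands into a
// REG_SEQUENCE and emit a single wider store.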
1533 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1534     CombineInfo &CI, CombineInfo &Paired,
1535     MachineBasicBlock::iterator InsertBefore) {
1536   MachineBasicBlock *MBB = CI.I->getParent();
1537   DebugLoc DL = CI.I->getDebugLoc();
1538 
1539   const unsigned Opcode = getNewOpcode(CI, Paired);
1540 
1541   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1542   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1543   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1544 
1545   // Copy to the new source register.
1546   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1547   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1548 
1549   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1550   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1551 
1552   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1553       .add(*Src0)
1554       .addImm(SubRegIdx0)
1555       .add(*Src1)
1556       .addImm(SubRegIdx1);
1557 
1558   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1559                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1560                  .addReg(SrcReg, RegState::Kill);
1561 
1562   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1563     MIB.add(*SAddr);
1564 
1565   MachineInstr *New =
1566     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1567        .addImm(CI.CPol)
1568        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1569 
1570   CI.I->eraseFromParent();
1571   Paired.I->eraseFromParent();
1572   return New;
1573 }
1574 
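// Return the opcode to use for the merged instruction of the combined width,
// or 0 if no suitable opcode exists.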
1575 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1576                                             const CombineInfo &Paired) {
1577   const unsigned Width = CI.Width + Paired.Width;
1578 
1579   switch (CI.InstClass) {
1580   default:
1581     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1582     // FIXME: Handle d16 correctly
1583     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1584                                   Width);
1585   case TBUFFER_LOAD:
1586   case TBUFFER_STORE:
1587     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1588                                   Width);
1589 
1590   case UNKNOWN:
1591     llvm_unreachable("Unknown instruction class");
1592   case S_BUFFER_LOAD_IMM:
1593     switch (Width) {
1594     default:
1595       return 0;
1596     case 2:
1597       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1598     case 4:
1599       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1600     case 8:
1601       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1602     }
1603   case GLOBAL_LOAD:
1604     switch (Width) {
1605     default:
1606       return 0;
1607     case 2:
1608       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1609     case 3:
1610       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1611     case 4:
1612       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1613     }
1614   case GLOBAL_LOAD_SADDR:
1615     switch (Width) {
1616     default:
1617       return 0;
1618     case 2:
1619       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1620     case 3:
1621       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1622     case 4:
1623       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1624     }
1625   case GLOBAL_STORE:
1626     switch (Width) {
1627     default:
1628       return 0;
1629     case 2:
1630       return AMDGPU::GLOBAL_STORE_DWORDX2;
1631     case 3:
1632       return AMDGPU::GLOBAL_STORE_DWORDX3;
1633     case 4:
1634       return AMDGPU::GLOBAL_STORE_DWORDX4;
1635     }
1636   case GLOBAL_STORE_SADDR:
1637     switch (Width) {
1638     default:
1639       return 0;
1640     case 2:
1641       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1642     case 3:
1643       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1644     case 4:
1645       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1646     }
1647   case FLAT_LOAD:
1648     switch (Width) {
1649     default:
1650       return 0;
1651     case 2:
1652       return AMDGPU::FLAT_LOAD_DWORDX2;
1653     case 3:
1654       return AMDGPU::FLAT_LOAD_DWORDX3;
1655     case 4:
1656       return AMDGPU::FLAT_LOAD_DWORDX4;
1657     }
1658   case FLAT_STORE:
1659     switch (Width) {
1660     default:
1661       return 0;
1662     case 2:
1663       return AMDGPU::FLAT_STORE_DWORDX2;
1664     case 3:
1665       return AMDGPU::FLAT_STORE_DWORDX3;
1666     case 4:
1667       return AMDGPU::FLAT_STORE_DWORDX4;
1668     }
1669   case MIMG:
1670     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1671            "No overlaps");
1672     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1673   }
1674 }
1675 
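// Return the pair of subregister indices into the merged super register: the
// first index selects CI's portion and the second selects Paired's portion.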
1676 std::pair<unsigned, unsigned>
1677 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1678                                     const CombineInfo &Paired) {
1679   assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1680                                    CI.Width + Paired.Width)) &&
1681          "No overlaps");
1682 
1683   unsigned Idx0;
1684   unsigned Idx1;
1685 
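  // Idxs[Start][Count - 1] is the subregister index covering Count
  // consecutive 32-bit subregisters starting at sub<Start>.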
1686   static const unsigned Idxs[5][4] = {
1687       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1688       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1689       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1690       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1691       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1692   };
1693 
1694   assert(CI.Width >= 1 && CI.Width <= 4);
1695   assert(Paired.Width >= 1 && Paired.Width <= 4);
1696 
1697   if (Paired < CI) {
1698     Idx1 = Idxs[0][Paired.Width - 1];
1699     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1700   } else {
1701     Idx0 = Idxs[0][CI.Width - 1];
1702     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1703   }
1704 
1705   return std::make_pair(Idx0, Idx1);
1706 }
1707 
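// Return the register class of the merged result: an SGPR class for S_BUFFER
// loads, otherwise an AGPR or VGPR class wide enough for the combined width.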
1708 const TargetRegisterClass *
1709 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1710                                              const CombineInfo &Paired) {
1711   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1712     switch (CI.Width + Paired.Width) {
1713     default:
1714       return nullptr;
1715     case 2:
1716       return &AMDGPU::SReg_64_XEXECRegClass;
1717     case 4:
1718       return &AMDGPU::SGPR_128RegClass;
1719     case 8:
1720       return &AMDGPU::SGPR_256RegClass;
1721     case 16:
1722       return &AMDGPU::SGPR_512RegClass;
1723     }
1724   }
1725 
1726   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1727   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1728              ? TRI->getAGPRClassForBitWidth(BitWidth)
1729              : TRI->getVGPRClassForBitWidth(BitWidth);
1730 }
1731 
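// Merge two MUBUF buffer stores: pack both data operands into a REG_SEQUENCE
// and emit a single wider store.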
1732 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1733     CombineInfo &CI, CombineInfo &Paired,
1734     MachineBasicBlock::iterator InsertBefore) {
1735   MachineBasicBlock *MBB = CI.I->getParent();
1736   DebugLoc DL = CI.I->getDebugLoc();
1737 
1738   const unsigned Opcode = getNewOpcode(CI, Paired);
1739 
1740   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1741   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1742   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1743 
1744   // Copy to the new source register.
1745   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1746   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1747 
1748   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1749   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1750 
1751   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1752       .add(*Src0)
1753       .addImm(SubRegIdx0)
1754       .add(*Src1)
1755       .addImm(SubRegIdx1);
1756 
1757   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1758                  .addReg(SrcReg, RegState::Kill);
1759 
1760   AddressRegs Regs = getRegs(Opcode, *TII);
1761 
1762   if (Regs.VAddr)
1763     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1764 
1766   // It shouldn't be possible to get this far if the two instructions
1767   // don't have a single memoperand, because MachineInstr::mayAlias()
1768   // will return true if this is the case.
1769   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1770 
1771   MachineInstr *New =
1772     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1773         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1774         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1775         .addImm(CI.CPol)      // cpol
1776         .addImm(0)            // tfe
1777         .addImm(0)            // swz
1778         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1779 
1780   CI.I->eraseFromParent();
1781   Paired.I->eraseFromParent();
1782   return New;
1783 }
1784 
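// Return Val as an immediate operand if it is an inline constant, otherwise
// materialize it with an S_MOV_B32 and return the new register operand.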
1785 MachineOperand
1786 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1787   APInt V(32, Val, true);
1788   if (TII->isInlineConstant(V))
1789     return MachineOperand::CreateImm(Val);
1790 
1791   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1792   MachineInstr *Mov =
1793   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1794           TII->get(AMDGPU::S_MOV_B32), Reg)
1795     .addImm(Val);
1796   (void)Mov;
1797   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1798   return MachineOperand::CreateReg(Reg, false);
1799 }
1800 
1801 // Compute base address using Addr and return the final register.
1802 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1803                                            const MemAddress &Addr) const {
1804   MachineBasicBlock *MBB = MI.getParent();
1805   MachineBasicBlock::iterator MBBI = MI.getIterator();
1806   DebugLoc DL = MI.getDebugLoc();
1807 
1808   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1809           Addr.Base.LoSubReg) &&
1810          "Expected 32-bit Base-Register-Low!!");
1811 
1812   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1813           Addr.Base.HiSubReg) &&
1814          "Expected 32-bit Base-Register-Hi!!");
1815 
1816   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1818   MachineOperand OffsetHi =
1819     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1820 
1821   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1822   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1823   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1824 
1825   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1826   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1827   MachineInstr *LoHalf =
1828     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1829       .addReg(CarryReg, RegState::Define)
1830       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1831       .add(OffsetLo)
1832       .addImm(0); // clamp bit
1833   (void)LoHalf;
1834   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1835 
1836   MachineInstr *HiHalf =
1837   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1838     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1839     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1840     .add(OffsetHi)
1841     .addReg(CarryReg, RegState::Kill)
1842     .addImm(0); // clamp bit
1843   (void)HiHalf;
1844   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1845 
1846   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1847   MachineInstr *FullBase =
1848     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1849       .addReg(DestSub0)
1850       .addImm(AMDGPU::sub0)
1851       .addReg(DestSub1)
1852       .addImm(AMDGPU::sub1);
1853   (void)FullBase;
1854   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1855 
1856   return FullDestReg;
1857 }
1858 
1859 // Update base and offset with the NewBase and NewOffset in MI.
1860 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1861                                                Register NewBase,
1862                                                int32_t NewOffset) const {
1863   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1864   Base->setReg(NewBase);
1865   Base->setIsKill(false);
1866   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1867 }
1868 
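// If Op is an immediate, or a register defined by an S_MOV_B32 of an
// immediate, return that constant. Otherwise return None.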
1869 Optional<int32_t>
1870 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1871   if (Op.isImm())
1872     return Op.getImm();
1873 
1874   if (!Op.isReg())
1875     return None;
1876 
1877   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1878   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1879       !Def->getOperand(1).isImm())
1880     return None;
1881 
1882   return Def->getOperand(1).getImm();
1883 }
1884 
// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
1888 // Expecting base computation as:
1889 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1890 //   %LO:vgpr_32, %c:sreg_64_xexec =
1891 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1892 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1893 //   %Base:vreg_64 =
1894 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
1897   if (!Base.isReg())
1898     return;
1899 
1900   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1901   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1902       || Def->getNumOperands() != 5)
1903     return;
1904 
1905   MachineOperand BaseLo = Def->getOperand(1);
1906   MachineOperand BaseHi = Def->getOperand(3);
1907   if (!BaseLo.isReg() || !BaseHi.isReg())
1908     return;
1909 
1910   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1911   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1912 
1913   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1914       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1915     return;
1916 
1917   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1918   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1919 
1920   auto Offset0P = extractConstOffset(*Src0);
1921   if (Offset0P)
1922     BaseLo = *Src1;
1923   else {
1924     if (!(Offset0P = extractConstOffset(*Src1)))
1925       return;
1926     BaseLo = *Src0;
1927   }
1928 
1929   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1930   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1931 
1932   if (Src0->isImm())
1933     std::swap(Src0, Src1);
1934 
1935   if (!Src1->isImm())
1936     return;
1937 
1938   uint64_t Offset1 = Src1->getImm();
1939   BaseHi = *Src0;
1940 
1941   Addr.Base.LoReg = BaseLo.getReg();
1942   Addr.Base.HiReg = BaseHi.getReg();
1943   Addr.Base.LoSubReg = BaseLo.getSubReg();
1944   Addr.Base.HiSubReg = BaseHi.getSubReg();
1945   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1946 }
1947 
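// Try to fold the constant offset feeding MI's 64-bit address computation into
// the instruction's immediate offset field, reusing the base address of a
// nearby anchor instruction that shares the same base registers. Returns true
// if any instruction was changed.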
1948 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1949     MachineInstr &MI,
1950     MemInfoMap &Visited,
1951     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1952 
1953   if (!(MI.mayLoad() ^ MI.mayStore()))
1954     return false;
1955 
1956   // TODO: Support flat and scratch.
1957   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1958     return false;
1959 
1960   if (MI.mayLoad() &&
1961       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1962     return false;
1963 
1964   if (AnchorList.count(&MI))
1965     return false;
1966 
1967   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1968 
1969   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1970     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1971     return false;
1972   }
1973 
  // Step1: Find the base registers and a 64-bit constant offset.
1975   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1976   MemAddress MAddr;
1977   if (Visited.find(&MI) == Visited.end()) {
1978     processBaseWithConstOffset(Base, MAddr);
1979     Visited[&MI] = MAddr;
1980   } else
1981     MAddr = Visited[&MI];
1982 
1983   if (MAddr.Offset == 0) {
1984     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1985                          " constant offsets that can be promoted.\n";);
1986     return false;
1987   }
1988 
1989   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1990              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1991 
  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base registers) with the highest 13-bit distance from MI's offset.
1994   // E.g. (64bit loads)
1995   // bb:
1996   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1997   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1998   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1999   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2000   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2001   //
2002   // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
2007   //
2008   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2009   // (&a + 8192) for load1, load2, load4.
2010   //   addr = &a + 8192
2011   //   load1 = load(addr,       -4096)
2012   //   load2 = load(addr,       -2048)
2013   //   load3 = load(addr,       0)
2014   //   load4 = load(addr,       2048)
2015   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2016   //
2017   MachineInstr *AnchorInst = nullptr;
2018   MemAddress AnchorAddr;
2019   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2020   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2021 
2022   MachineBasicBlock *MBB = MI.getParent();
2023   MachineBasicBlock::iterator E = MBB->end();
2024   MachineBasicBlock::iterator MBBI = MI.getIterator();
2025   ++MBBI;
2026   const SITargetLowering *TLI =
2027     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2028 
2029   for ( ; MBBI != E; ++MBBI) {
2030     MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store addresses
    // or any other load addresses where the opcodes are different.
2033     if (MINext.getOpcode() != MI.getOpcode() ||
2034         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2035       continue;
2036 
2037     const MachineOperand &BaseNext =
2038       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2039     MemAddress MAddrNext;
2040     if (Visited.find(&MINext) == Visited.end()) {
2041       processBaseWithConstOffset(BaseNext, MAddrNext);
2042       Visited[&MINext] = MAddrNext;
2043     } else
2044       MAddrNext = Visited[&MINext];
2045 
2046     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2047         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2048         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2049         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2050       continue;
2051 
2052     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
2053 
2054     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2055     TargetLoweringBase::AddrMode AM;
2056     AM.HasBaseReg = true;
2057     AM.BaseOffs = Dist;
2058     if (TLI->isLegalGlobalAddressingMode(AM) &&
2059         (uint32_t)std::abs(Dist) > MaxDist) {
2060       MaxDist = std::abs(Dist);
2061 
2062       AnchorAddr = MAddrNext;
2063       AnchorInst = &MINext;
2064     }
2065   }
2066 
2067   if (AnchorInst) {
2068     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2069                AnchorInst->dump());
2070     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2071                <<  AnchorAddr.Offset << "\n\n");
2072 
2073     // Instead of moving up, just re-compute anchor-instruction's base address.
2074     Register Base = computeBase(MI, AnchorAddr);
2075 
2076     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2077     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2078 
2079     for (auto P : InstsWCommonBase) {
2080       TargetLoweringBase::AddrMode AM;
2081       AM.HasBaseReg = true;
2082       AM.BaseOffs = P.second - AnchorAddr.Offset;
2083 
2084       if (TLI->isLegalGlobalAddressingMode(AM)) {
2085         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2086                    dbgs() << ")"; P.first->dump());
2087         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2088         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2089       }
2090     }
2091     AnchorList.insert(AnchorInst);
2092     return true;
2093   }
2094 
2095   return false;
2096 }
2097 
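// Append CI to the list whose entries share its instruction class, AGPR use
// and base address; start a new list if no matching list exists.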
2098 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2099                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2100   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2101     if (AddrList.front().InstClass == CI.InstClass &&
2102         AddrList.front().IsAGPR == CI.IsAGPR &&
2103         AddrList.front().hasSameBaseAddress(*CI.I)) {
2104       AddrList.emplace_back(CI);
2105       return;
2106     }
2107   }
2108 
2109   // Base address not found, so add a new list.
2110   MergeableInsts.emplace_back(1, CI);
2111 }
2112 
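// Scan [Begin, End) and group mergeable instructions into per-base-address
// lists. Returns the iterator where the scan stopped (a memory barrier or End)
// and whether any instruction was modified by constant offset promotion.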
2113 std::pair<MachineBasicBlock::iterator, bool>
2114 SILoadStoreOptimizer::collectMergeableInsts(
2115     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2116     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2117     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2118   bool Modified = false;
2119 
  // Sort potentially mergeable instructions into lists, one per base address.
2121   unsigned Order = 0;
2122   MachineBasicBlock::iterator BlockI = Begin;
2123   for (; BlockI != End; ++BlockI) {
2124     MachineInstr &MI = *BlockI;
2125 
    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
2128     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2129       Modified = true;
2130 
2131     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2132     // barriers. We can look after this barrier for separate merges.
2133     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2134       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2135 
2136       // Search will resume after this instruction in a separate merge list.
2137       ++BlockI;
2138       break;
2139     }
2140 
2141     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2142     if (InstClass == UNKNOWN)
2143       continue;
2144 
2145     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2146     int Swizzled =
2147         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2148     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2149       continue;
2150 
2151     CombineInfo CI;
2152     CI.setMI(MI, *this);
2153     CI.Order = Order++;
2154 
2155     if (!CI.hasMergeableAddress(*MRI))
2156       continue;
2157 
2158     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
2165       continue;
2166     }
2167 
2168     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2169 
2170     addInstToMergeableList(CI, MergeableInsts);
2171   }
2172 
2173   // At this point we have lists of Mergeable instructions.
2174   //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.
2179 
2180   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2181                                                    E = MergeableInsts.end(); I != E;) {
2182 
2183     std::list<CombineInfo> &MergeList = *I;
2184     if (MergeList.size() <= 1) {
2185       // This means we have found only one instruction with a given address
2186       // that can be merged, and we need at least 2 instructions to do a merge,
2187       // so this list can be discarded.
2188       I = MergeableInsts.erase(I);
2189       continue;
2190     }
2191 
    // Sort the lists by offset; this way, mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
2195     MergeList.sort(
2196         [] (const CombineInfo &A, const CombineInfo &B) {
2197           return A.Offset < B.Offset;
2198         });
2199     ++I;
2200   }
2201 
2202   return std::make_pair(BlockI, Modified);
2203 }
2204 
2205 // Scan through looking for adjacent LDS operations with constant offsets from
2206 // the same base register. We rely on the scheduler to do the hard work of
2207 // clustering nearby loads, and assume these are all adjacent.
2208 bool SILoadStoreOptimizer::optimizeBlock(
2209                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2210   bool Modified = false;
2211 
2212   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2213                                                    E = MergeableInsts.end(); I != E;) {
2214     std::list<CombineInfo> &MergeList = *I;
2215 
2216     bool OptimizeListAgain = false;
2217     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2218       // We weren't able to make any changes, so delete the list so we don't
2219       // process the same instructions the next time we try to optimize this
2220       // block.
2221       I = MergeableInsts.erase(I);
2222       continue;
2223     }
2224 
2225     Modified = true;
2226 
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2229     if (!OptimizeListAgain) {
2230       I = MergeableInsts.erase(I);
2231       continue;
2232     }
2233     OptimizeAgain = true;
2234   }
2235   return Modified;
2236 }
2237 
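// Merge pairs of instructions from a list of candidates sharing a base
// address. OptimizeListAgain is set when a merged result is still narrow
// enough that it may be merged again on a later pass over the list.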
2238 bool
2239 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2240                                           std::list<CombineInfo> &MergeList,
2241                                           bool &OptimizeListAgain) {
2242   if (MergeList.empty())
2243     return false;
2244 
2245   bool Modified = false;
2246 
2247   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2248        Next = std::next(I)) {
2249 
2250     auto First = I;
2251     auto Second = Next;
2252 
2253     if ((*First).Order > (*Second).Order)
2254       std::swap(First, Second);
2255     CombineInfo &CI = *First;
2256     CombineInfo &Paired = *Second;
2257 
2258     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2259     if (!Where) {
2260       ++I;
2261       continue;
2262     }
2263 
2264     Modified = true;
2265 
2266     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2267 
2268     MachineBasicBlock::iterator NewMI;
2269     switch (CI.InstClass) {
2270     default:
2271       llvm_unreachable("unknown InstClass");
2272       break;
2273     case DS_READ:
2274       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2275       break;
2276     case DS_WRITE:
2277       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2278       break;
2279     case S_BUFFER_LOAD_IMM:
2280       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2281       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2282       break;
2283     case BUFFER_LOAD:
2284       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2285       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2286       break;
2287     case BUFFER_STORE:
2288       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2289       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2290       break;
2291     case MIMG:
2292       NewMI = mergeImagePair(CI, Paired, Where->I);
2293       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2294       break;
2295     case TBUFFER_LOAD:
2296       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2297       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2298       break;
2299     case TBUFFER_STORE:
2300       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2301       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2302       break;
2303     case FLAT_LOAD:
2304     case GLOBAL_LOAD:
2305     case GLOBAL_LOAD_SADDR:
2306       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2307       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2308       break;
2309     case FLAT_STORE:
2310     case GLOBAL_STORE:
2311     case GLOBAL_STORE_SADDR:
2312       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2313       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2314       break;
2315     }
2316     CI.setMI(NewMI, *this);
2317     CI.Order = Where->Order;
2318     if (I == Second)
2319       I = Next;
2320 
2321     MergeList.erase(Second);
2322   }
2323 
2324   return Modified;
2325 }
2326 
2327 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2328   if (skipFunction(MF.getFunction()))
2329     return false;
2330 
2331   STM = &MF.getSubtarget<GCNSubtarget>();
2332   if (!STM->loadStoreOptEnabled())
2333     return false;
2334 
2335   TII = STM->getInstrInfo();
2336   TRI = &TII->getRegisterInfo();
2337 
2338   MRI = &MF.getRegInfo();
2339   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2340 
2341   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2342 
2343   bool Modified = false;
2344 
  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
2347   SmallPtrSet<MachineInstr *, 4> AnchorList;
2348   MemInfoMap Visited;
2349 
2350   for (MachineBasicBlock &MBB : MF) {
2351     MachineBasicBlock::iterator SectionEnd;
2352     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2353          I = SectionEnd) {
2354       bool CollectModified;
2355       std::list<std::list<CombineInfo>> MergeableInsts;
2356 
2357       // First pass: Collect list of all instructions we know how to merge in a
2358       // subset of the block.
2359       std::tie(SectionEnd, CollectModified) =
2360           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2361 
2362       Modified |= CollectModified;
2363 
2364       do {
2365         OptimizeAgain = false;
2366         Modified |= optimizeBlock(MergeableInsts);
2367       } while (OptimizeAgain);
2368     }
2369 
2370     Visited.clear();
2371     AnchorList.clear();
2372   }
2373 
2374   return Modified;
2375 }
2376