1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from a nearby instruction that
24 // allows it to have a 13-bit constant offset and then promotes that 13-bit
25 // offset to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. This currently matches one
50 //   pair, recomputes live intervals, and then moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset field, but close enough together that the differences do fit, we
56 //   can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   BUFFER_LOAD,
78   BUFFER_STORE,
79   MIMG,
80   TBUFFER_LOAD,
81   TBUFFER_STORE,
82   GLOBAL_LOAD,
83   GLOBAL_LOAD_SADDR
84 };
85 
86 struct AddressRegs {
87   unsigned char NumVAddrs = 0;
88   bool SBase = false;
89   bool SRsrc = false;
90   bool SOffset = false;
91   bool SAddr = false;
92   bool VAddr = false;
93   bool Addr = false;
94   bool SSamp = false;
95 };
96 
97 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
98 const unsigned MaxAddressRegs = 12 + 1 + 1;
99 
100 class SILoadStoreOptimizer : public MachineFunctionPass {
101   struct CombineInfo {
102     MachineBasicBlock::iterator I;
103     unsigned EltSize;
104     unsigned Offset;
105     unsigned Width;
106     unsigned Format;
107     unsigned BaseOff;
108     unsigned DMask;
109     InstClassEnum InstClass;
110     unsigned CPol = 0;
111     bool IsAGPR;
112     bool UseST64;
113     int AddrIdx[MaxAddressRegs];
114     const MachineOperand *AddrReg[MaxAddressRegs];
115     unsigned NumAddresses;
116     unsigned Order;
117 
118     bool hasSameBaseAddress(const MachineInstr &MI) {
119       for (unsigned i = 0; i < NumAddresses; i++) {
120         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
121 
122         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
123           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
124               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
125             return false;
126           }
127           continue;
128         }
129 
130         // Check same base pointer. Be careful of subregisters, which can occur
131         // with vectors of pointers.
132         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
133             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
134           return false;
135         }
136       }
137       return true;
138     }
139 
140     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
141       for (unsigned i = 0; i < NumAddresses; ++i) {
142         const MachineOperand *AddrOp = AddrReg[i];
143         // Immediates are always OK.
144         if (AddrOp->isImm())
145           continue;
146 
147         // Don't try to merge addresses that aren't either immediates or
148         // registers. TODO: Should be possible to merge FrameIndexes and
149         // maybe some other non-register operands.
150         if (!AddrOp->isReg())
151           return false;
152 
153         // TODO: We should be able to merge physical reg addresses.
154         if (AddrOp->getReg().isPhysical())
155           return false;
156 
157         // If an address has only one use then there will be no other
158         // instructions with the same address, so we can't merge this one.
159         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
160           return false;
161       }
162       return true;
163     }
164 
165     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
166   };
167 
168   struct BaseRegisters {
169     Register LoReg;
170     Register HiReg;
171 
172     unsigned LoSubReg = 0;
173     unsigned HiSubReg = 0;
174   };
175 
176   struct MemAddress {
177     BaseRegisters Base;
178     int64_t Offset = 0;
179   };
180 
181   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
182 
183 private:
184   const GCNSubtarget *STM = nullptr;
185   const SIInstrInfo *TII = nullptr;
186   const SIRegisterInfo *TRI = nullptr;
187   MachineRegisterInfo *MRI = nullptr;
188   AliasAnalysis *AA = nullptr;
189   bool OptimizeAgain;
190 
191   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
192                            const DenseSet<Register> &ARegUses,
193                            const MachineInstr &A, const MachineInstr &B) const;
194   static bool dmasksCanBeCombined(const CombineInfo &CI,
195                                   const SIInstrInfo &TII,
196                                   const CombineInfo &Paired);
197   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
198                                    CombineInfo &Paired, bool Modify = false);
199   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
200                         const CombineInfo &Paired);
201   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
202   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
203                                                      const CombineInfo &Paired);
204   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
205                                                     const CombineInfo &Paired);
206   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
207 
208   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
209 
210   unsigned read2Opcode(unsigned EltSize) const;
211   unsigned read2ST64Opcode(unsigned EltSize) const;
212   MachineBasicBlock::iterator
213   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
214                  MachineBasicBlock::iterator InsertBefore);
215 
216   unsigned write2Opcode(unsigned EltSize) const;
217   unsigned write2ST64Opcode(unsigned EltSize) const;
218   MachineBasicBlock::iterator
219   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
220                   MachineBasicBlock::iterator InsertBefore);
221   MachineBasicBlock::iterator
222   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
223                  MachineBasicBlock::iterator InsertBefore);
224   MachineBasicBlock::iterator
225   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
226                           MachineBasicBlock::iterator InsertBefore);
227   MachineBasicBlock::iterator
228   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
229                       MachineBasicBlock::iterator InsertBefore);
230   MachineBasicBlock::iterator
231   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
232                        MachineBasicBlock::iterator InsertBefore);
233   MachineBasicBlock::iterator
234   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
235                        MachineBasicBlock::iterator InsertBefore);
236   MachineBasicBlock::iterator
237   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
238                         MachineBasicBlock::iterator InsertBefore);
239   MachineBasicBlock::iterator
240   mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
241                       MachineBasicBlock::iterator InsertBefore);
242 
243   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
244                            int32_t NewOffset) const;
245   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
246   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
247   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
248   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
249   /// Promotes a constant offset to the immediate by adjusting the base. It
250   /// tries to use a base from a nearby instruction that allows it to have
251   /// a 13-bit constant offset which gets promoted to the immediate.
252   bool promoteConstantOffsetToImm(MachineInstr &CI,
253                                   MemInfoMap &Visited,
254                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
255   void addInstToMergeableList(const CombineInfo &CI,
256                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
257 
258   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
259       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
260       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
261       std::list<std::list<CombineInfo>> &MergeableInsts) const;
262 
263 public:
264   static char ID;
265 
266   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
267     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
268   }
269 
270   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
271                                      bool &OptimizeListAgain);
272   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
273 
274   bool runOnMachineFunction(MachineFunction &MF) override;
275 
276   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
277 
278   void getAnalysisUsage(AnalysisUsage &AU) const override {
279     AU.setPreservesCFG();
280     AU.addRequired<AAResultsWrapperPass>();
281 
282     MachineFunctionPass::getAnalysisUsage(AU);
283   }
284 
285   MachineFunctionProperties getRequiredProperties() const override {
286     return MachineFunctionProperties()
287       .set(MachineFunctionProperties::Property::IsSSA);
288   }
289 };
290 
291 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
292   const unsigned Opc = MI.getOpcode();
293 
294   if (TII.isMUBUF(Opc)) {
295     // FIXME: Handle d16 correctly
296     return AMDGPU::getMUBUFElements(Opc);
297   }
298   if (TII.isMIMG(MI)) {
299     uint64_t DMaskImm =
300         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
301     return countPopulation(DMaskImm);
302   }
303   if (TII.isMTBUF(Opc)) {
304     return AMDGPU::getMTBUFElements(Opc);
305   }
306 
307   switch (Opc) {
308   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
309   case AMDGPU::GLOBAL_LOAD_DWORD:
310   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
311     return 1;
312   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
313   case AMDGPU::GLOBAL_LOAD_DWORDX2:
314   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
315     return 2;
316   case AMDGPU::GLOBAL_LOAD_DWORDX3:
317   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
318     return 3;
319   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
320   case AMDGPU::GLOBAL_LOAD_DWORDX4:
321   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
322     return 4;
323   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
324     return 8;
325   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
326   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
327   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
328   case AMDGPU::DS_WRITE_B32_gfx9:
329     return 1;
330   case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
331   case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
332   case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
333   case AMDGPU::DS_WRITE_B64_gfx9:
334     return 2;
335   default:
336     return 0;
337   }
338 }
339 
340 /// Maps instruction opcode to enum InstClassEnum.
341 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
342   switch (Opc) {
343   default:
344     if (TII.isMUBUF(Opc)) {
345       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
346       default:
347         return UNKNOWN;
348       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
349       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
350       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
351       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
352         return BUFFER_LOAD;
353       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
354       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
355       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
356       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
357         return BUFFER_STORE;
358       }
359     }
360     if (TII.isMIMG(Opc)) {
361       // Ignore instructions encoded without vaddr.
362       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
363           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
364         return UNKNOWN;
365       // Ignore BVH instructions
366       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
367         return UNKNOWN;
368       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
369       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
370           TII.isGather4(Opc))
371         return UNKNOWN;
372       return MIMG;
373     }
374     if (TII.isMTBUF(Opc)) {
375       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
376       default:
377         return UNKNOWN;
378       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
379       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
380       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
381       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
382         return TBUFFER_LOAD;
383       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
384       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
385       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
386       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
387         return TBUFFER_STORE;
388       }
389     }
390     return UNKNOWN;
391   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
392   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
393   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
394   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
395     return S_BUFFER_LOAD_IMM;
396   case AMDGPU::DS_READ_B32:
397   case AMDGPU::DS_READ_B32_gfx9:
398   case AMDGPU::DS_READ_B64:
399   case AMDGPU::DS_READ_B64_gfx9:
400     return DS_READ;
401   case AMDGPU::DS_WRITE_B32:
402   case AMDGPU::DS_WRITE_B32_gfx9:
403   case AMDGPU::DS_WRITE_B64:
404   case AMDGPU::DS_WRITE_B64_gfx9:
405     return DS_WRITE;
406   case AMDGPU::GLOBAL_LOAD_DWORD:
407   case AMDGPU::GLOBAL_LOAD_DWORDX2:
408   case AMDGPU::GLOBAL_LOAD_DWORDX3:
409   case AMDGPU::GLOBAL_LOAD_DWORDX4:
410     return GLOBAL_LOAD;
411   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
412   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
413   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
414   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
415     return GLOBAL_LOAD_SADDR;
416   }
417 }
418 
419 /// Determines instruction subclass from opcode. Only instructions
420 /// of the same subclass can be merged together. The merged instruction may have
421 /// a different subclass but must have the same class.
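/// E.g. an S_BUFFER_LOAD_DWORDX2_IMM produced by an earlier merge still maps
/// to the S_BUFFER_LOAD_DWORD_IMM subclass and may therefore be merged again.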
422 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
423   switch (Opc) {
424   default:
425     if (TII.isMUBUF(Opc))
426       return AMDGPU::getMUBUFBaseOpcode(Opc);
427     if (TII.isMIMG(Opc)) {
428       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
429       assert(Info);
430       return Info->BaseOpcode;
431     }
432     if (TII.isMTBUF(Opc))
433       return AMDGPU::getMTBUFBaseOpcode(Opc);
434     return -1;
435   case AMDGPU::DS_READ_B32:
436   case AMDGPU::DS_READ_B32_gfx9:
437   case AMDGPU::DS_READ_B64:
438   case AMDGPU::DS_READ_B64_gfx9:
439   case AMDGPU::DS_WRITE_B32:
440   case AMDGPU::DS_WRITE_B32_gfx9:
441   case AMDGPU::DS_WRITE_B64:
442   case AMDGPU::DS_WRITE_B64_gfx9:
443     return Opc;
444   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
449   case AMDGPU::GLOBAL_LOAD_DWORD:
450   case AMDGPU::GLOBAL_LOAD_DWORDX2:
451   case AMDGPU::GLOBAL_LOAD_DWORDX3:
452   case AMDGPU::GLOBAL_LOAD_DWORDX4:
453     return AMDGPU::GLOBAL_LOAD_DWORD;
454   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
455   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
456   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
457   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
458     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
459   }
460 }
461 
462 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
463   AddressRegs Result;
464 
465   if (TII.isMUBUF(Opc)) {
466     if (AMDGPU::getMUBUFHasVAddr(Opc))
467       Result.VAddr = true;
468     if (AMDGPU::getMUBUFHasSrsrc(Opc))
469       Result.SRsrc = true;
470     if (AMDGPU::getMUBUFHasSoffset(Opc))
471       Result.SOffset = true;
472 
473     return Result;
474   }
475 
476   if (TII.isMIMG(Opc)) {
477     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478     if (VAddr0Idx >= 0) {
479       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
480       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
481     } else {
482       Result.VAddr = true;
483     }
484     Result.SRsrc = true;
485     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
486     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
487       Result.SSamp = true;
488 
489     return Result;
490   }
491   if (TII.isMTBUF(Opc)) {
492     if (AMDGPU::getMTBUFHasVAddr(Opc))
493       Result.VAddr = true;
494     if (AMDGPU::getMTBUFHasSrsrc(Opc))
495       Result.SRsrc = true;
496     if (AMDGPU::getMTBUFHasSoffset(Opc))
497       Result.SOffset = true;
498 
499     return Result;
500   }
501 
502   switch (Opc) {
503   default:
504     return Result;
505   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
506   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
507   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
508   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
509     Result.SBase = true;
510     return Result;
511   case AMDGPU::DS_READ_B32:
512   case AMDGPU::DS_READ_B64:
513   case AMDGPU::DS_READ_B32_gfx9:
514   case AMDGPU::DS_READ_B64_gfx9:
515   case AMDGPU::DS_WRITE_B32:
516   case AMDGPU::DS_WRITE_B64:
517   case AMDGPU::DS_WRITE_B32_gfx9:
518   case AMDGPU::DS_WRITE_B64_gfx9:
519     Result.Addr = true;
520     return Result;
521   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
522   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
523   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
524   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
525     Result.SAddr = true;
526     LLVM_FALLTHROUGH;
527   case AMDGPU::GLOBAL_LOAD_DWORD:
528   case AMDGPU::GLOBAL_LOAD_DWORDX2:
529   case AMDGPU::GLOBAL_LOAD_DWORDX3:
530   case AMDGPU::GLOBAL_LOAD_DWORDX4:
531     Result.VAddr = true;
532     return Result;
533   }
534 }
535 
536 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
537                                               const SILoadStoreOptimizer &LSO) {
538   I = MI;
539   unsigned Opc = MI->getOpcode();
540   InstClass = getInstClass(Opc, *LSO.TII);
541 
542   if (InstClass == UNKNOWN)
543     return;
544 
545   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
546 
547   switch (InstClass) {
548   case DS_READ:
549    EltSize =
550           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
551                                                                           : 4;
552    break;
553   case DS_WRITE:
554     EltSize =
555           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
556                                                                             : 4;
557     break;
558   case S_BUFFER_LOAD_IMM:
559     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
560     break;
561   default:
562     EltSize = 4;
563     break;
564   }
565 
566   if (InstClass == MIMG) {
567     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
568     // Offset is not considered for MIMG instructions.
569     Offset = 0;
570   } else {
571     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
572     Offset = I->getOperand(OffsetIdx).getImm();
573   }
574 
575   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
576     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
577 
578   Width = getOpcodeWidth(*I, *LSO.TII);
579 
580   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
581     Offset &= 0xffff;
582   } else if (InstClass != MIMG) {
583     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
584   }
585 
586   AddressRegs Regs = getRegs(Opc, *LSO.TII);
587 
588   NumAddresses = 0;
589   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
590     AddrIdx[NumAddresses++] =
591         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
592   if (Regs.Addr)
593     AddrIdx[NumAddresses++] =
594         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
595   if (Regs.SBase)
596     AddrIdx[NumAddresses++] =
597         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
598   if (Regs.SRsrc)
599     AddrIdx[NumAddresses++] =
600         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
601   if (Regs.SOffset)
602     AddrIdx[NumAddresses++] =
603         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
604   if (Regs.SAddr)
605     AddrIdx[NumAddresses++] =
606         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
607   if (Regs.VAddr)
608     AddrIdx[NumAddresses++] =
609         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
610   if (Regs.SSamp)
611     AddrIdx[NumAddresses++] =
612         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
613   assert(NumAddresses <= MaxAddressRegs);
614 
615   for (unsigned J = 0; J < NumAddresses; J++)
616     AddrReg[J] = &I->getOperand(AddrIdx[J]);
617 }
618 
619 } // end anonymous namespace.
620 
621 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
622                       "SI Load Store Optimizer", false, false)
623 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
624 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
625                     false, false)
626 
627 char SILoadStoreOptimizer::ID = 0;
628 
629 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
630 
631 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
632   return new SILoadStoreOptimizer();
633 }
634 
635 static void addDefsUsesToList(const MachineInstr &MI,
636                               DenseSet<Register> &RegDefs,
637                               DenseSet<Register> &RegUses) {
638   for (const auto &Op : MI.operands()) {
639     if (!Op.isReg())
640       continue;
641     if (Op.isDef())
642       RegDefs.insert(Op.getReg());
643     if (Op.readsReg())
644       RegUses.insert(Op.getReg());
645   }
646 }
647 
648 bool SILoadStoreOptimizer::canSwapInstructions(
649     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
650     const MachineInstr &A, const MachineInstr &B) const {
651   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
652       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
653     return false;
654   for (const auto &BOp : B.operands()) {
655     if (!BOp.isReg())
656       continue;
657     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
658       return false;
659     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
660       return false;
661   }
662   return true;
663 }
664 
665 // This function assumes that \p A and \p B are identical except for
666 // size and offset, and that they reference adjacent memory.
667 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
668                                                    const MachineMemOperand *A,
669                                                    const MachineMemOperand *B) {
670   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
671   unsigned Size = A->getSize() + B->getSize();
672   // This function adds the offset parameter to the existing offset for A,
673   // so we pass 0 here as the offset and then manually set it to the correct
674   // value after the call.
675   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
676   MMO->setOffset(MinOffset);
677   return MMO;
678 }
679 
680 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
681                                                const SIInstrInfo &TII,
682                                                const CombineInfo &Paired) {
683   assert(CI.InstClass == MIMG);
684 
685   // Ignore instructions with tfe/lwe set.
686   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
687   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
688 
689   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
690     return false;
691 
692   // Check other optional immediate operands for equality.
693   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
694                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
695                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
696 
697   for (auto op : OperandsToMatch) {
698     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
699     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
700       return false;
701     if (Idx != -1 &&
702         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
703       return false;
704   }
705 
706   // Check DMask for overlaps.
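  // The two dmasks are only combinable if the numerically smaller one lies
  // entirely below the lowest set bit of the larger one, e.g. 0b0011 and
  // 0b1100 combine, while 0b0101 and 0b1010 do not.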
707   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
708   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
709 
710   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
711   if ((1u << AllowedBitsForMin) <= MinMask)
712     return false;
713 
714   return true;
715 }
716 
717 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
718                                        unsigned ComponentCount,
719                                        const GCNSubtarget &STI) {
720   if (ComponentCount > 4)
721     return 0;
722 
723   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
724       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
725   if (!OldFormatInfo)
726     return 0;
727 
728   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
729       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
730                                            ComponentCount,
731                                            OldFormatInfo->NumFormat, STI);
732 
733   if (!NewFormatInfo)
734     return 0;
735 
736   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
737          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
738 
739   return NewFormatInfo->Format;
740 }
741 
742 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
743 // highest power of two. Note that the result is well defined for all inputs
744 // including corner cases like:
745 // - if Lo == Hi, return that value
746 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
747 // - if Lo > Hi, return 0 (as if the range wrapped around)
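// For example, mostAlignedValueInRange(0x121, 0x234) == 0x200: it is the only
// value in that range aligned to 0x200, and nothing in the range is aligned
// to a larger power of two.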
748 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
749   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
750 }
751 
752 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
753                                                 const GCNSubtarget &STI,
754                                                 CombineInfo &Paired,
755                                                 bool Modify) {
756   assert(CI.InstClass != MIMG);
757 
758   // XXX - Would the same offset be OK? Is there any reason this would happen or
759   // be useful?
760   if (CI.Offset == Paired.Offset)
761     return false;
762 
763   // This won't be valid if the offset isn't aligned.
764   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
765     return false;
766 
767   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
768 
769     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
770         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
771     if (!Info0)
772       return false;
773     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
774         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
775     if (!Info1)
776       return false;
777 
778     if (Info0->BitsPerComp != Info1->BitsPerComp ||
779         Info0->NumFormat != Info1->NumFormat)
780       return false;
781 
782     // TODO: Should be possible to support more formats, but if format loads
783     // are not dword-aligned, the merged load might not be valid.
784     if (Info0->BitsPerComp != 32)
785       return false;
786 
787     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
788       return false;
789   }
790 
791   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
792   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
793   CI.UseST64 = false;
794   CI.BaseOff = 0;
795 
796   // Handle all non-DS instructions.
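  // For these, the two accesses only need to be contiguous in element units
  // and agree on the cache policy bits, e.g. a two-element access at element
  // offset 4 pairs with a one-element access at element offset 6.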
797   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
798     return (EltOffset0 + CI.Width == EltOffset1 ||
799             EltOffset1 + Paired.Width == EltOffset0) &&
800            CI.CPol == Paired.CPol;
801   }
802 
803   // If the offsets in elements don't fit in 8 bits, we might be able to use
804   // the stride-64 versions.
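  // For example, element offsets 6400 and 12800 are both multiples of 64 and
  // become offset0:100 and offset1:200 in the ST64 form.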
805   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
806       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
807     if (Modify) {
808       CI.Offset = EltOffset0 / 64;
809       Paired.Offset = EltOffset1 / 64;
810       CI.UseST64 = true;
811     }
812     return true;
813   }
814 
815   // Check if the new offsets fit in the reduced 8-bit range.
816   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
817     if (Modify) {
818       CI.Offset = EltOffset0;
819       Paired.Offset = EltOffset1;
820     }
821     return true;
822   }
823 
824   // Try to shift base address to decrease offsets.
825   uint32_t Min = std::min(EltOffset0, EltOffset1);
826   uint32_t Max = std::max(EltOffset0, EltOffset1);
827 
828   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
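  // Mask == 0xff << 6, so the check below requires Max - Min to be a multiple
  // of 64 that is at most 255 * 64.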
829   if (((Max - Min) & ~Mask) == 0) {
830     if (Modify) {
831       // From the range of values we could use for BaseOff, choose the one that
832       // is aligned to the highest power of two, to maximise the chance that
833       // the same offset can be reused for other load/store pairs.
834       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
835       // Copy the low bits of the offsets, so that when we adjust them by
836       // subtracting BaseOff they will be multiples of 64.
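      // Max - Min is a multiple of 64 here, so both offsets share the same low
      // 6 bits and both adjusted offsets end up divisible by 64.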
837       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
838       CI.BaseOff = BaseOff * CI.EltSize;
839       CI.Offset = (EltOffset0 - BaseOff) / 64;
840       Paired.Offset = (EltOffset1 - BaseOff) / 64;
841       CI.UseST64 = true;
842     }
843     return true;
844   }
845 
846   if (isUInt<8>(Max - Min)) {
847     if (Modify) {
848       // From the range of values we could use for BaseOff, choose the one that
849       // is aligned to the highest power of two, to maximise the chance that
850       // the same offset can be reused for other load/store pairs.
851       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
852       CI.BaseOff = BaseOff * CI.EltSize;
853       CI.Offset = EltOffset0 - BaseOff;
854       Paired.Offset = EltOffset1 - BaseOff;
855     }
856     return true;
857   }
858 
859   return false;
860 }
861 
862 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
863                                      const CombineInfo &CI,
864                                      const CombineInfo &Paired) {
865   const unsigned Width = (CI.Width + Paired.Width);
866   switch (CI.InstClass) {
867   default:
868     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
869   case S_BUFFER_LOAD_IMM:
870     switch (Width) {
871     default:
872       return false;
873     case 2:
874     case 4:
875     case 8:
876       return true;
877     }
878   }
879 }
880 
881 const TargetRegisterClass *
882 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
883   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
884     return TRI->getRegClassForReg(*MRI, Dst->getReg());
885   }
886   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
887     return TRI->getRegClassForReg(*MRI, Src->getReg());
888   }
889   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
890     return TRI->getRegClassForReg(*MRI, Src->getReg());
891   }
892   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
893     return TRI->getRegClassForReg(*MRI, Dst->getReg());
894   }
895   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
896     return TRI->getRegClassForReg(*MRI, Src->getReg());
897   }
898   return nullptr;
899 }
900 
901 /// This function assumes that CI comes before Paired in a basic block. Return
902 /// the CombineInfo to use as the insertion point, or nullptr on failure.
903 SILoadStoreOptimizer::CombineInfo *
904 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
905                                            CombineInfo &Paired) {
906   // If another instruction has already been merged into CI, it may now be a
907   // type that we can't do any further merging into.
908   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
909     return nullptr;
910   assert(CI.InstClass == Paired.InstClass);
911 
912   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
913       getInstSubclass(Paired.I->getOpcode(), *TII))
914     return nullptr;
915 
916   // Check both offsets (or masks for MIMG) can be combined and fit in the
917   // reduced range.
918   if (CI.InstClass == MIMG) {
919     if (!dmasksCanBeCombined(CI, *TII, Paired))
920       return nullptr;
921   } else {
922     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
923       return nullptr;
924   }
925 
926   DenseSet<Register> RegDefs;
927   DenseSet<Register> RegUses;
928   CombineInfo *Where;
929   if (CI.I->mayLoad()) {
930     // Try to hoist Paired up to CI.
931     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
932     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
933       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
934         return nullptr;
935     }
936     Where = &CI;
937   } else {
938     // Try to sink CI down to Paired.
939     addDefsUsesToList(*CI.I, RegDefs, RegUses);
940     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
941       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
942         return nullptr;
943     }
944     Where = &Paired;
945   }
946 
947   // Call offsetsCanBeCombined with modify = true so that the offsets are
948   // correct for the new instruction.  This should return true, because
949   // this function should only be called on CombineInfo objects that
950   // have already been confirmed to be mergeable.
951   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
952     offsetsCanBeCombined(CI, *STM, Paired, true);
953   return Where;
954 }
955 
956 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
957   if (STM->ldsRequiresM0Init())
958     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
959   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
960 }
961 
962 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
963   if (STM->ldsRequiresM0Init())
964     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
965 
966   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
967                         : AMDGPU::DS_READ2ST64_B64_gfx9;
968 }
969 
970 MachineBasicBlock::iterator
971 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
972                                      MachineBasicBlock::iterator InsertBefore) {
973   MachineBasicBlock *MBB = CI.I->getParent();
974 
975   // Be careful, since the addresses could be subregisters themselves in weird
976   // cases, like vectors of pointers.
977   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
978 
979   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
980   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
981 
982   unsigned NewOffset0 = CI.Offset;
983   unsigned NewOffset1 = Paired.Offset;
984   unsigned Opc =
985       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
986 
987   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
988   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
989 
990   if (NewOffset0 > NewOffset1) {
991     // Canonicalize the merged instruction so the smaller offset comes first.
992     std::swap(NewOffset0, NewOffset1);
993     std::swap(SubRegIdx0, SubRegIdx1);
994   }
995 
996   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
997          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
998 
999   const MCInstrDesc &Read2Desc = TII->get(Opc);
1000 
1001   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1002   Register DestReg = MRI->createVirtualRegister(SuperRC);
1003 
1004   DebugLoc DL = CI.I->getDebugLoc();
1005 
1006   Register BaseReg = AddrReg->getReg();
1007   unsigned BaseSubReg = AddrReg->getSubReg();
1008   unsigned BaseRegFlags = 0;
1009   if (CI.BaseOff) {
1010     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1011     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1012         .addImm(CI.BaseOff);
1013 
1014     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1015     BaseRegFlags = RegState::Kill;
1016 
1017     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1018         .addReg(ImmReg)
1019         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1020         .addImm(0); // clamp bit
1021     BaseSubReg = 0;
1022   }
1023 
1024   MachineInstrBuilder Read2 =
1025       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1026           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1027           .addImm(NewOffset0)                        // offset0
1028           .addImm(NewOffset1)                        // offset1
1029           .addImm(0)                                 // gds
1030           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1031 
1032   (void)Read2;
1033 
1034   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1035 
1036   // Copy to the old destination registers.
1037   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1038       .add(*Dest0) // Copy to same destination including flags and sub reg.
1039       .addReg(DestReg, 0, SubRegIdx0);
1040   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1041       .add(*Dest1)
1042       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1043 
1044   CI.I->eraseFromParent();
1045   Paired.I->eraseFromParent();
1046 
1047   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1048   return Read2;
1049 }
1050 
1051 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1052   if (STM->ldsRequiresM0Init())
1053     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1054   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1055                         : AMDGPU::DS_WRITE2_B64_gfx9;
1056 }
1057 
1058 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1059   if (STM->ldsRequiresM0Init())
1060     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1061                           : AMDGPU::DS_WRITE2ST64_B64;
1062 
1063   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1064                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1065 }
1066 
1067 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1068     CombineInfo &CI, CombineInfo &Paired,
1069     MachineBasicBlock::iterator InsertBefore) {
1070   MachineBasicBlock *MBB = CI.I->getParent();
1071 
1072   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1073   // we preserve the subregister index and any register flags set on them.
1074   const MachineOperand *AddrReg =
1075       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1076   const MachineOperand *Data0 =
1077       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1078   const MachineOperand *Data1 =
1079       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1080 
1081   unsigned NewOffset0 = CI.Offset;
1082   unsigned NewOffset1 = Paired.Offset;
1083   unsigned Opc =
1084       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1085 
1086   if (NewOffset0 > NewOffset1) {
1087     // Canonicalize the merged instruction so the smaller offset comes first.
1088     std::swap(NewOffset0, NewOffset1);
1089     std::swap(Data0, Data1);
1090   }
1091 
1092   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1093          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1094 
1095   const MCInstrDesc &Write2Desc = TII->get(Opc);
1096   DebugLoc DL = CI.I->getDebugLoc();
1097 
1098   Register BaseReg = AddrReg->getReg();
1099   unsigned BaseSubReg = AddrReg->getSubReg();
1100   unsigned BaseRegFlags = 0;
1101   if (CI.BaseOff) {
1102     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1103     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1104         .addImm(CI.BaseOff);
1105 
1106     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1107     BaseRegFlags = RegState::Kill;
1108 
1109     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1110         .addReg(ImmReg)
1111         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1112         .addImm(0); // clamp bit
1113     BaseSubReg = 0;
1114   }
1115 
1116   MachineInstrBuilder Write2 =
1117       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1118           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1119           .add(*Data0)                               // data0
1120           .add(*Data1)                               // data1
1121           .addImm(NewOffset0)                        // offset0
1122           .addImm(NewOffset1)                        // offset1
1123           .addImm(0)                                 // gds
1124           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1125 
1126   CI.I->eraseFromParent();
1127   Paired.I->eraseFromParent();
1128 
1129   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1130   return Write2;
1131 }
1132 
1133 MachineBasicBlock::iterator
1134 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1135                                      MachineBasicBlock::iterator InsertBefore) {
1136   MachineBasicBlock *MBB = CI.I->getParent();
1137   DebugLoc DL = CI.I->getDebugLoc();
1138   const unsigned Opcode = getNewOpcode(CI, Paired);
1139 
1140   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1141 
1142   Register DestReg = MRI->createVirtualRegister(SuperRC);
1143   unsigned MergedDMask = CI.DMask | Paired.DMask;
1144   unsigned DMaskIdx =
1145       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1146 
1147   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1148   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1149     if (I == DMaskIdx)
1150       MIB.addImm(MergedDMask);
1151     else
1152       MIB.add((*CI.I).getOperand(I));
1153   }
1154 
1155   // It shouldn't be possible to get this far if the two instructions
1156   // don't have a single memoperand, because MachineInstr::mayAlias()
1157   // will return true if this is the case.
1158   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1159 
1160   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1161   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1162 
1163   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1164 
1165   unsigned SubRegIdx0, SubRegIdx1;
1166   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1167 
1168   // Copy to the old destination registers.
1169   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1170   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1171   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1172 
1173   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1174       .add(*Dest0) // Copy to same destination including flags and sub reg.
1175       .addReg(DestReg, 0, SubRegIdx0);
1176   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1177       .add(*Dest1)
1178       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1179 
1180   CI.I->eraseFromParent();
1181   Paired.I->eraseFromParent();
1182   return New;
1183 }
1184 
1185 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1186     CombineInfo &CI, CombineInfo &Paired,
1187     MachineBasicBlock::iterator InsertBefore) {
1188   MachineBasicBlock *MBB = CI.I->getParent();
1189   DebugLoc DL = CI.I->getDebugLoc();
1190   const unsigned Opcode = getNewOpcode(CI, Paired);
1191 
1192   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1193 
1194   Register DestReg = MRI->createVirtualRegister(SuperRC);
1195   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1196 
1197   // It shouldn't be possible to get this far if the two instructions
1198   // don't have a single memoperand, because MachineInstr::mayAlias()
1199   // will return true if this is the case.
1200   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1201 
1202   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1203   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1204 
1205   MachineInstr *New =
1206       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1207           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1208           .addImm(MergedOffset) // offset
1209           .addImm(CI.CPol)      // cpol
1210           .addMemOperand(
1211               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1212 
1213   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1214   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1215   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1216 
1217   // Copy to the old destination registers.
1218   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1219   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1220   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1221 
1222   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1223       .add(*Dest0) // Copy to same destination including flags and sub reg.
1224       .addReg(DestReg, 0, SubRegIdx0);
1225   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1226       .add(*Dest1)
1227       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1228 
1229   CI.I->eraseFromParent();
1230   Paired.I->eraseFromParent();
1231   return New;
1232 }
1233 
1234 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1235     CombineInfo &CI, CombineInfo &Paired,
1236     MachineBasicBlock::iterator InsertBefore) {
1237   MachineBasicBlock *MBB = CI.I->getParent();
1238   DebugLoc DL = CI.I->getDebugLoc();
1239 
1240   const unsigned Opcode = getNewOpcode(CI, Paired);
1241 
1242   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1243 
1244   // Create the destination register for the merged load.
1245   Register DestReg = MRI->createVirtualRegister(SuperRC);
1246   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1247 
1248   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1249 
1250   AddressRegs Regs = getRegs(Opcode, *TII);
1251 
1252   if (Regs.VAddr)
1253     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1254 
1255   // It shouldn't be possible to get this far if the two instructions
1256   // don't have a single memoperand, because MachineInstr::mayAlias()
1257   // will return true if this is the case.
1258   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1259 
1260   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1261   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1262 
1263   MachineInstr *New =
1264     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1265         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1266         .addImm(MergedOffset) // offset
1267         .addImm(CI.CPol)      // cpol
1268         .addImm(0)            // tfe
1269         .addImm(0)            // swz
1270         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1271 
1272   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1273   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1274   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1275 
1276   // Copy to the old destination registers.
1277   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1278   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1279   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1280 
1281   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1282       .add(*Dest0) // Copy to same destination including flags and sub reg.
1283       .addReg(DestReg, 0, SubRegIdx0);
1284   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1285       .add(*Dest1)
1286       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1287 
1288   CI.I->eraseFromParent();
1289   Paired.I->eraseFromParent();
1290   return New;
1291 }
1292 
1293 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1294     CombineInfo &CI, CombineInfo &Paired,
1295     MachineBasicBlock::iterator InsertBefore) {
1296   MachineBasicBlock *MBB = CI.I->getParent();
1297   DebugLoc DL = CI.I->getDebugLoc();
1298 
1299   const unsigned Opcode = getNewOpcode(CI, Paired);
1300 
1301   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1302 
1303   // Create the destination register for the merged load.
1304   Register DestReg = MRI->createVirtualRegister(SuperRC);
1305   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1306 
1307   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1308 
1309   AddressRegs Regs = getRegs(Opcode, *TII);
1310 
1311   if (Regs.VAddr)
1312     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1313 
1314   unsigned JoinedFormat =
1315       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1316 
1317   // It shouldn't be possible to get this far if the two instructions
1318   // don't have a single memoperand, because MachineInstr::mayAlias()
1319   // will return true if this is the case.
1320   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1321 
1322   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1323   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1324 
1325   MachineInstr *New =
1326       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1327           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1328           .addImm(MergedOffset) // offset
1329           .addImm(JoinedFormat) // format
1330           .addImm(CI.CPol)      // cpol
1331           .addImm(0)            // tfe
1332           .addImm(0)            // swz
1333           .addMemOperand(
1334               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1335 
1336   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1337   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1338   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1339 
1340   // Copy to the old destination registers.
1341   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1342   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1343   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1344 
1345   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1346       .add(*Dest0) // Copy to same destination including flags and sub reg.
1347       .addReg(DestReg, 0, SubRegIdx0);
1348   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1349       .add(*Dest1)
1350       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1351 
1352   CI.I->eraseFromParent();
1353   Paired.I->eraseFromParent();
1354   return New;
1355 }
1356 
1357 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1358     CombineInfo &CI, CombineInfo &Paired,
1359     MachineBasicBlock::iterator InsertBefore) {
1360   MachineBasicBlock *MBB = CI.I->getParent();
1361   DebugLoc DL = CI.I->getDebugLoc();
1362 
1363   const unsigned Opcode = getNewOpcode(CI, Paired);
1364 
1365   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1366   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1367   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1368 
1369   // Copy to the new source register.
1370   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1371   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1372 
1373   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1374   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1375 
1376   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1377       .add(*Src0)
1378       .addImm(SubRegIdx0)
1379       .add(*Src1)
1380       .addImm(SubRegIdx1);
1381 
1382   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1383                  .addReg(SrcReg, RegState::Kill);
1384 
1385   AddressRegs Regs = getRegs(Opcode, *TII);
1386 
1387   if (Regs.VAddr)
1388     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1389 
1390   unsigned JoinedFormat =
1391       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1392 
1393   // It shouldn't be possible to get this far if the two instructions
1394   // don't have a single memoperand, because MachineInstr::mayAlias()
1395   // will return true if this is the case.
1396   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1397 
1398   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1399   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1400 
1401   MachineInstr *New =
1402       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1403           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1404           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1405           .addImm(JoinedFormat)                       // format
1406           .addImm(CI.CPol)                            // cpol
1407           .addImm(0)                                  // tfe
1408           .addImm(0)                                  // swz
1409           .addMemOperand(
1410               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1411 
1412   CI.I->eraseFromParent();
1413   Paired.I->eraseFromParent();
1414   return New;
1415 }
1416 
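// Merge two global loads with adjacent offsets into one wider load; an
// illustrative example (register numbers are arbitrary):
//   global_load_dword v0, v[2:3], off offset:16
//   global_load_dword v1, v[2:3], off offset:20
// ==>
//   global_load_dwordx2 v[0:1], v[2:3], off offset:16
// followed by COPYs from the wide result back to the original destinations.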
1417 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
1418     CombineInfo &CI, CombineInfo &Paired,
1419     MachineBasicBlock::iterator InsertBefore) {
1420   MachineBasicBlock *MBB = CI.I->getParent();
1421   DebugLoc DL = CI.I->getDebugLoc();
1422 
1423   const unsigned Opcode = getNewOpcode(CI, Paired);
1424 
1425   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1426   Register DestReg = MRI->createVirtualRegister(SuperRC);
1427 
1428   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1429 
1430   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1431     MIB.add(*SAddr);
1432 
1433   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1434   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1435 
1436   MachineInstr *New =
1437     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1438        .addImm(std::min(CI.Offset, Paired.Offset))
1439        .addImm(CI.CPol)
1440        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1441 
1442   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1443   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1444   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1445 
1446   // Copy to the old destination registers.
1447   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1448   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1449   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1450 
1451   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1452       .add(*Dest0) // Copy to same destination including flags and sub reg.
1453       .addReg(DestReg, 0, SubRegIdx0);
1454   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1455       .add(*Dest1)
1456       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1457 
1458   CI.I->eraseFromParent();
1459   Paired.I->eraseFromParent();
1460   return New;
1461 }
1462 
1463 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1464                                             const CombineInfo &Paired) {
1465   const unsigned Width = CI.Width + Paired.Width;
1466 
1467   switch (CI.InstClass) {
1468   default:
1469     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1470     // FIXME: Handle d16 correctly
1471     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1472                                   Width);
1473   case TBUFFER_LOAD:
1474   case TBUFFER_STORE:
1475     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1476                                   Width);
1477 
1478   case UNKNOWN:
1479     llvm_unreachable("Unknown instruction class");
1480   case S_BUFFER_LOAD_IMM:
1481     switch (Width) {
1482     default:
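      // No wider S_BUFFER_LOAD opcode exists for this combined width.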
1483       return 0;
1484     case 2:
1485       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1486     case 4:
1487       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1488     case 8:
1489       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1490     }
1491   case GLOBAL_LOAD:
1492     switch (Width) {
1493     default:
1494       return 0;
1495     case 2:
1496       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1497     case 3:
1498       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1499     case 4:
1500       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1501     }
1502   case GLOBAL_LOAD_SADDR:
1503     switch (Width) {
1504     default:
1505       return 0;
1506     case 2:
1507       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1508     case 3:
1509       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1510     case 4:
1511       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1512     }
1513   case MIMG:
1514     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1515            "No overlaps");
1516     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1517   }
1518 }
1519 
1520 std::pair<unsigned, unsigned>
1521 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1522                                     const CombineInfo &Paired) {
1523   bool ReverseOrder;
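  // Decide which instruction's data occupies the low dwords of the merged
  // register: the one with the smaller offset (or, for MIMG, the smaller
  // dmask) comes first.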
1524   if (CI.InstClass == MIMG) {
1525     assert(
1526         (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1527         "No overlaps");
1528     ReverseOrder = CI.DMask > Paired.DMask;
1529   } else {
1530     ReverseOrder = CI.Offset > Paired.Offset;
1531   }
1532 
1533   unsigned Idx0;
1534   unsigned Idx1;
1535 
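  // Idxs[Start][Count - 1] is the subregister index covering Count dwords
  // starting at dword Start of the merged register.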
1536   static const unsigned Idxs[5][4] = {
1537       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1538       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1539       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1540       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1541       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1542   };
1543 
1544   assert(CI.Width >= 1 && CI.Width <= 4);
1545   assert(Paired.Width >= 1 && Paired.Width <= 4);
1546 
1547   if (ReverseOrder) {
1548     Idx1 = Idxs[0][Paired.Width - 1];
1549     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1550   } else {
1551     Idx0 = Idxs[0][CI.Width - 1];
1552     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1553   }
1554 
1555   return std::make_pair(Idx0, Idx1);
1556 }
1557 
1558 const TargetRegisterClass *
1559 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1560                                              const CombineInfo &Paired) {
1561   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1562     switch (CI.Width + Paired.Width) {
1563     default:
1564       return nullptr;
1565     case 2:
1566       return &AMDGPU::SReg_64_XEXECRegClass;
1567     case 4:
1568       return &AMDGPU::SGPR_128RegClass;
1569     case 8:
1570       return &AMDGPU::SGPR_256RegClass;
1571     case 16:
1572       return &AMDGPU::SGPR_512RegClass;
1573     }
1574   }
1575 
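  // For the VMEM classes each element is one 32-bit dword; pick an AGPR or
  // VGPR class wide enough for the combined access, matching the register
  // bank of the original data operand.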
1576   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1577   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1578              ? TRI->getAGPRClassForBitWidth(BitWidth)
1579              : TRI->getVGPRClassForBitWidth(BitWidth);
1580 }
1581 
1582 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1583     CombineInfo &CI, CombineInfo &Paired,
1584     MachineBasicBlock::iterator InsertBefore) {
1585   MachineBasicBlock *MBB = CI.I->getParent();
1586   DebugLoc DL = CI.I->getDebugLoc();
1587 
1588   const unsigned Opcode = getNewOpcode(CI, Paired);
1589 
1590   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1591   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1592   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1593 
1594   // Copy to the new source register.
1595   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1596   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1597 
1598   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1599   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1600 
1601   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1602       .add(*Src0)
1603       .addImm(SubRegIdx0)
1604       .add(*Src1)
1605       .addImm(SubRegIdx1);
1606 
1607   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1608                  .addReg(SrcReg, RegState::Kill);
1609 
1610   AddressRegs Regs = getRegs(Opcode, *TII);
1611 
1612   if (Regs.VAddr)
1613     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1614 
1616   // It shouldn't be possible to get this far if the two instructions
1617   // don't have a single memoperand, because MachineInstr::mayAlias()
1618   // will return true if this is the case.
1619   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1620 
1621   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1622   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1623 
1624   MachineInstr *New =
1625     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1626         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1627         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1628         .addImm(CI.CPol)                             // cpol
1629         .addImm(0)                                   // tfe
1630         .addImm(0)                                   // swz
1631         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1632 
1633   CI.I->eraseFromParent();
1634   Paired.I->eraseFromParent();
1635   return New;
1636 }
1637 
1638 MachineOperand
1639 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1640   APInt V(32, Val, true);
1641   if (TII->isInlineConstant(V))
1642     return MachineOperand::CreateImm(Val);
1643 
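  // Not an inline constant: materialize the value in an SGPR with S_MOV_B32.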
1644   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1645   MachineInstr *Mov =
1646   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1647           TII->get(AMDGPU::S_MOV_B32), Reg)
1648     .addImm(Val);
1649   (void)Mov;
1650   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1651   return MachineOperand::CreateReg(Reg, false);
1652 }
1653 
1654 // Compute base address using Addr and return the final register.
1655 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1656                                            const MemAddress &Addr) const {
1657   MachineBasicBlock *MBB = MI.getParent();
1658   MachineBasicBlock::iterator MBBI = MI.getIterator();
1659   DebugLoc DL = MI.getDebugLoc();
1660 
1661   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1662           Addr.Base.LoSubReg) &&
1663          "Expected 32-bit Base-Register-Low!!");
1664 
1665   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1666           Addr.Base.HiSubReg) &&
1667          "Expected 32-bit Base-Register-Hi!!");
1668 
1669   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1670   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1671   MachineOperand OffsetHi =
1672     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1673 
1674   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1675   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1676   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1677 
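  // Materialize Base + Offset as a full 64-bit add: a carry-producing low-half
  // add, a carry-consuming high-half add, and a REG_SEQUENCE to reassemble the
  // 64-bit result.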
1678   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1679   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1680   MachineInstr *LoHalf =
1681     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1682       .addReg(CarryReg, RegState::Define)
1683       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1684       .add(OffsetLo)
1685       .addImm(0); // clamp bit
1686   (void)LoHalf;
1687   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1688 
1689   MachineInstr *HiHalf =
1690   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1691     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1692     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1693     .add(OffsetHi)
1694     .addReg(CarryReg, RegState::Kill)
1695     .addImm(0); // clamp bit
1696   (void)HiHalf;
1697   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1698 
1699   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1700   MachineInstr *FullBase =
1701     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1702       .addReg(DestSub0)
1703       .addImm(AMDGPU::sub0)
1704       .addReg(DestSub1)
1705       .addImm(AMDGPU::sub1);
1706   (void)FullBase;
1707   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1708 
1709   return FullDestReg;
1710 }
1711 
1712 // Update base and offset with the NewBase and NewOffset in MI.
1713 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1714                                                Register NewBase,
1715                                                int32_t NewOffset) const {
1716   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1717   Base->setReg(NewBase);
1718   Base->setIsKill(false);
1719   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1720 }
1721 
1722 Optional<int32_t>
1723 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1724   if (Op.isImm())
1725     return Op.getImm();
1726 
1727   if (!Op.isReg())
1728     return None;
1729 
1730   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1731   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1732       !Def->getOperand(1).isImm())
1733     return None;
1734 
1735   return Def->getOperand(1).getImm();
1736 }
1737 
1738 // Analyze Base and extract:
1739 //  - 32-bit base registers and subregisters
1740 //  - 64-bit constant offset
1741 // Expecting base computation as:
1742 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1743 //   %LO:vgpr_32, %c:sreg_64_xexec =
1744 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1745 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1746 //   %Base:vreg_64 =
1747 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1748 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1749                                                       MemAddress &Addr) const {
1750   if (!Base.isReg())
1751     return;
1752 
1753   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1754   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1755       || Def->getNumOperands() != 5)
1756     return;
1757 
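  // A REG_SEQUENCE's operands are (def, value0, subreg0, value1, subreg1), so
  // the 32-bit halves of the base are operands 1 and 3.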
1758   MachineOperand BaseLo = Def->getOperand(1);
1759   MachineOperand BaseHi = Def->getOperand(3);
1760   if (!BaseLo.isReg() || !BaseHi.isReg())
1761     return;
1762 
1763   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1764   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1765 
1766   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1767       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1768     return;
1769 
1770   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1771   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1772 
1773   auto Offset0P = extractConstOffset(*Src0);
1774   if (Offset0P)
1775     BaseLo = *Src1;
1776   else {
1777     if (!(Offset0P = extractConstOffset(*Src1)))
1778       return;
1779     BaseLo = *Src0;
1780   }
1781 
1782   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1783   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1784 
1785   if (Src0->isImm())
1786     std::swap(Src0, Src1);
1787 
1788   if (!Src1->isImm())
1789     return;
1790 
1791   uint64_t Offset1 = Src1->getImm();
1792   BaseHi = *Src0;
1793 
1794   Addr.Base.LoReg = BaseLo.getReg();
1795   Addr.Base.HiReg = BaseHi.getReg();
1796   Addr.Base.LoSubReg = BaseLo.getSubReg();
1797   Addr.Base.HiSubReg = BaseHi.getSubReg();
1798   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1799 }
1800 
1801 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1802     MachineInstr &MI,
1803     MemInfoMap &Visited,
1804     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1805 
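  // Handle only instructions that are exactly one of load or store; anything
  // that is both (e.g. an atomic with a returned value) or neither is skipped.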
1806   if (!(MI.mayLoad() ^ MI.mayStore()))
1807     return false;
1808 
1809   // TODO: Support flat and scratch.
1810   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1811     return false;
1812 
1813   if (MI.mayLoad() &&
1814       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1815     return false;
1816 
1817   if (AnchorList.count(&MI))
1818     return false;
1819 
1820   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1821 
1822   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1823     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1824     return false;
1825   }
1826 
1827   // Step 1: Find the base registers and a 64-bit constant offset.
1828   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1829   MemAddress MAddr;
1830   if (Visited.find(&MI) == Visited.end()) {
1831     processBaseWithConstOffset(Base, MAddr);
1832     Visited[&MI] = MAddr;
1833   } else
1834     MAddr = Visited[&MI];
1835 
1836   if (MAddr.Offset == 0) {
1837     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1838                          " constant offsets that can be promoted.\n";);
1839     return false;
1840   }
1841 
1842   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1843              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1844 
1845   // Step 2: Traverse through MI's basic block and find an anchor (one that has
1846   // the same base registers) with the highest 13-bit distance from MI's offset.
1847   // E.g. (64-bit loads)
1848   // bb:
1849   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1850   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1851   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1852   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1853   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1854   //
1855   // Starting from the first load, the optimization tries to find a new base
1856   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1857   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
1858   // new base (anchor) because the maximum distance presumably lets it cover
1859   // the largest number of intermediate bases.
1860   //
1861   // Step 3: Move (&a + 8192) above load1, then compute and promote offsets from
1862   // (&a + 8192) for load1, load2 and load4.
1863   //   addr = &a + 8192
1864   //   load1 = load(addr,       -4096)
1865   //   load2 = load(addr,       -2048)
1866   //   load3 = load(addr,       0)
1867   //   load4 = load(addr,       2048)
1868   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1869   //
1870   MachineInstr *AnchorInst = nullptr;
1871   MemAddress AnchorAddr;
1872   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1873   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1874 
1875   MachineBasicBlock *MBB = MI.getParent();
1876   MachineBasicBlock::iterator E = MBB->end();
1877   MachineBasicBlock::iterator MBBI = MI.getIterator();
1878   ++MBBI;
1879   const SITargetLowering *TLI =
1880     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1881 
1882   for ( ; MBBI != E; ++MBBI) {
1883     MachineInstr &MINext = *MBBI;
1884     // TODO: Support finding an anchor (with the same base) from store addresses
1885     // or any other load addresses where the opcodes are different.
1886     if (MINext.getOpcode() != MI.getOpcode() ||
1887         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1888       continue;
1889 
1890     const MachineOperand &BaseNext =
1891       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1892     MemAddress MAddrNext;
1893     if (Visited.find(&MINext) == Visited.end()) {
1894       processBaseWithConstOffset(BaseNext, MAddrNext);
1895       Visited[&MINext] = MAddrNext;
1896     } else
1897       MAddrNext = Visited[&MINext];
1898 
1899     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1900         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1901         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1902         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1903       continue;
1904 
1905     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1906 
1907     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
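    // Ask the target whether base + Dist is a legal global addressing mode,
    // i.e. whether the displacement fits the immediate offset field, and
    // prefer the farthest legal candidate as the anchor.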
1908     TargetLoweringBase::AddrMode AM;
1909     AM.HasBaseReg = true;
1910     AM.BaseOffs = Dist;
1911     if (TLI->isLegalGlobalAddressingMode(AM) &&
1912         (uint32_t)std::abs(Dist) > MaxDist) {
1913       MaxDist = std::abs(Dist);
1914 
1915       AnchorAddr = MAddrNext;
1916       AnchorInst = &MINext;
1917     }
1918   }
1919 
1920   if (AnchorInst) {
1921     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1922                AnchorInst->dump());
1923     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1924                <<  AnchorAddr.Offset << "\n\n");
1925 
1926     // Instead of moving up, just re-compute anchor-instruction's base address.
1927     Register Base = computeBase(MI, AnchorAddr);
1928 
1929     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1930     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1931 
1932     for (auto P : InstsWCommonBase) {
1933       TargetLoweringBase::AddrMode AM;
1934       AM.HasBaseReg = true;
1935       AM.BaseOffs = P.second - AnchorAddr.Offset;
1936 
1937       if (TLI->isLegalGlobalAddressingMode(AM)) {
1938         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1939                    dbgs() << ")"; P.first->dump());
1940         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1941         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1942       }
1943     }
1944     AnchorList.insert(AnchorInst);
1945     return true;
1946   }
1947 
1948   return false;
1949 }
1950 
1951 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1952                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1953   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1954     if (AddrList.front().InstClass == CI.InstClass &&
1955         AddrList.front().IsAGPR == CI.IsAGPR &&
1956         AddrList.front().hasSameBaseAddress(*CI.I)) {
1957       AddrList.emplace_back(CI);
1958       return;
1959     }
1960   }
1961 
1962   // Base address not found, so add a new list.
1963   MergeableInsts.emplace_back(1, CI);
1964 }
1965 
1966 std::pair<MachineBasicBlock::iterator, bool>
1967 SILoadStoreOptimizer::collectMergeableInsts(
1968     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1969     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1970     std::list<std::list<CombineInfo>> &MergeableInsts) const {
1971   bool Modified = false;
1972 
1973   // Sort potentially mergeable instructions into lists, one per base address.
1974   unsigned Order = 0;
1975   MachineBasicBlock::iterator BlockI = Begin;
1976   for (; BlockI != End; ++BlockI) {
1977     MachineInstr &MI = *BlockI;
1978 
1979     // We run this before checking if an address is mergeable, because it can
1980     // produce better code even if the instructions aren't mergeable.
1981     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1982       Modified = true;
1983 
1984     // Treat volatile accesses, ordered accesses and unmodeled side effects as
1985     // barriers. Accesses after the barrier can still be merged separately.
1986     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
1987       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
1988 
1989       // Search will resume after this instruction in a separate merge list.
1990       ++BlockI;
1991       break;
1992     }
1993 
1994     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1995     if (InstClass == UNKNOWN)
1996       continue;
1997 
1998     // Do not merge VMEM buffer instructions with "swizzled" bit set.
1999     int Swizzled =
2000         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2001     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2002       continue;
2003 
2004     CombineInfo CI;
2005     CI.setMI(MI, *this);
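    // Record the original program order; it determines which of two paired
    // instructions is treated as the earlier one when they are merged.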
2006     CI.Order = Order++;
2007 
2008     if (!CI.hasMergeableAddress(*MRI))
2009       continue;
2010 
2011     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2012       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2013       //        operands. However, we report that ds_write2 takes only VGPR
2014       //        data so that machine copy propagation does not create an
2015       //        illegal instruction mixing VGPR and AGPR sources. Consequently,
2016       //        if we created such an instruction, the verifier would complain.
2018       continue;
2019     }
2020 
2021     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2022 
2023     addInstToMergeableList(CI, MergeableInsts);
2024   }
2025 
2026   // At this point we have lists of mergeable instructions.
2027   //
2028   // Part 2: Sort each list by offset so that instructions which can be merged
2029   // end up adjacent to one another, and drop any list with fewer than two
2030   // entries, since a merge needs at least two instructions. The actual pairing
2031   // is done later, in optimizeInstsWithSameBaseAddr().
2032 
2033   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2034                                                    E = MergeableInsts.end(); I != E;) {
2035 
2036     std::list<CombineInfo> &MergeList = *I;
2037     if (MergeList.size() <= 1) {
2038       // This means we have found only one instruction with a given address
2039       // that can be merged, and we need at least 2 instructions to do a merge,
2040       // so this list can be discarded.
2041       I = MergeableInsts.erase(I);
2042       continue;
2043     }
2044 
2045     // Sort the list by offset so that mergeable instructions end up adjacent to
2046     // each other, which makes it easier to find matches.
2048     MergeList.sort(
2049         [] (const CombineInfo &A, const CombineInfo &B) {
2050           return A.Offset < B.Offset;
2051         });
2052     ++I;
2053   }
2054 
2055   return std::make_pair(BlockI, Modified);
2056 }
2057 
2058 // Scan through looking for adjacent memory operations with constant offsets
2059 // from the same base register. We rely on the scheduler to do the hard work
2060 // of clustering nearby loads, and assume these are all adjacent.
2061 bool SILoadStoreOptimizer::optimizeBlock(
2062                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2063   bool Modified = false;
2064 
2065   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2066                                                    E = MergeableInsts.end(); I != E;) {
2067     std::list<CombineInfo> &MergeList = *I;
2068 
2069     bool OptimizeListAgain = false;
2070     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2071       // We weren't able to make any changes, so delete the list so we don't
2072       // process the same instructions the next time we try to optimize this
2073       // block.
2074       I = MergeableInsts.erase(I);
2075       continue;
2076     }
2077 
2078     Modified = true;
2079 
2080     // We made changes, but also determined that there were no more optimization
2081     // opportunities, so we don't need to reprocess the list.
2082     if (!OptimizeListAgain) {
2083       I = MergeableInsts.erase(I);
2084       continue;
2085     }
2086     OptimizeAgain = true;
2087   }
2088   return Modified;
2089 }
2090 
2091 bool
2092 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2093                                           std::list<CombineInfo> &MergeList,
2094                                           bool &OptimizeListAgain) {
2095   if (MergeList.empty())
2096     return false;
2097 
2098   bool Modified = false;
2099 
2100   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2101        Next = std::next(I)) {
2102 
2103     auto First = I;
2104     auto Second = Next;
2105 
2106     if ((*First).Order > (*Second).Order)
2107       std::swap(First, Second);
2108     CombineInfo &CI = *First;
2109     CombineInfo &Paired = *Second;
2110 
2111     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2112     if (!Where) {
2113       ++I;
2114       continue;
2115     }
2116 
2117     Modified = true;
2118 
2119     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2120 
2121     MachineBasicBlock::iterator NewMI;
2122     switch (CI.InstClass) {
2123     default:
2124       llvm_unreachable("unknown InstClass");
2125       break;
2126     case DS_READ:
2127       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2128       break;
2129     case DS_WRITE:
2130       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2131       break;
2132     case S_BUFFER_LOAD_IMM:
2133       NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
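      // A merged SMEM load may itself merge again until it reaches the maximum
      // width of 8 dwords; the VMEM classes below cap at 4.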
2134       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2135       break;
2136     case BUFFER_LOAD:
2137       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2138       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2139       break;
2140     case BUFFER_STORE:
2141       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2142       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2143       break;
2144     case MIMG:
2145       NewMI = mergeImagePair(CI, Paired, Where->I);
2146       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2147       break;
2148     case TBUFFER_LOAD:
2149       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2150       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2151       break;
2152     case TBUFFER_STORE:
2153       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2154       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2155       break;
2156     case GLOBAL_LOAD:
2157     case GLOBAL_LOAD_SADDR:
2158       NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
2159       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2160       break;
2161     }
2162     CI.setMI(NewMI, *this);
2163     CI.Order = Where->Order;
2164     if (I == Second)
2165       I = Next;
2166 
2167     MergeList.erase(Second);
2168   }
2169 
2170   return Modified;
2171 }
2172 
2173 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2174   if (skipFunction(MF.getFunction()))
2175     return false;
2176 
2177   STM = &MF.getSubtarget<GCNSubtarget>();
2178   if (!STM->loadStoreOptEnabled())
2179     return false;
2180 
2181   TII = STM->getInstrInfo();
2182   TRI = &TII->getRegisterInfo();
2183 
2184   MRI = &MF.getRegInfo();
2185   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2186 
2187   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2188 
2189   bool Modified = false;
2190 
2191   // Contains the list of instructions for which constant offsets are being
2192   // promoted to the immediate. This is tracked for an entire block at a time.
2193   SmallPtrSet<MachineInstr *, 4> AnchorList;
2194   MemInfoMap Visited;
2195 
2196   for (MachineBasicBlock &MBB : MF) {
2197     MachineBasicBlock::iterator SectionEnd;
2198     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2199          I = SectionEnd) {
2200       bool CollectModified;
2201       std::list<std::list<CombineInfo>> MergeableInsts;
2202 
2203       // First pass: Collect list of all instructions we know how to merge in a
2204       // subset of the block.
2205       std::tie(SectionEnd, CollectModified) =
2206           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2207 
2208       Modified |= CollectModified;
2209 
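      // Re-run the merger until a fixed point: a merged access may itself be
      // mergeable with a neighbour on the next iteration.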
2210       do {
2211         OptimizeAgain = false;
2212         Modified |= optimizeBlock(MergeableInsts);
2213       } while (OptimizeAgain);
2214     }
2215 
2216     Visited.clear();
2217     AnchorList.clear();
2218   }
2219 
2220   return Modified;
2221 }
2222