1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows the access to have a 13-bit constant offset, which is then promoted
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently relies on the scheduler to place loads and stores next to
46 //   each other, and then only merges adjacent pairs of instructions. It would
47 //   be good to be more flexible with interleaved instructions, and possibly run
48 //   before scheduling. It currently misses stores of constants because the load
49 //   of the constant into the data register is placed between the stores, although
50 //   this is arguably a scheduling problem.
51 //
52 // - Recomputing live intervals seems inefficient. This currently only matches
53 //   one pair, recomputes live intervals, and moves on to the next pair. It
54 //   would be better to compute a list of all merges that need to occur.
55 //
56 // - With a list of instructions to process, we can also merge more. If a
57 //   cluster of loads has offsets that are too large to fit in the 8-bit
58 //   offset fields, but the offsets are close enough together, we can add to the
59 //   base pointer and use the new, reduced offsets.
60 //
61 //===----------------------------------------------------------------------===//
62 
63 #include "AMDGPU.h"
64 #include "AMDGPUSubtarget.h"
65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/CodeGen/MachineBasicBlock.h"
74 #include "llvm/CodeGen/MachineFunction.h"
75 #include "llvm/CodeGen/MachineFunctionPass.h"
76 #include "llvm/CodeGen/MachineInstr.h"
77 #include "llvm/CodeGen/MachineInstrBuilder.h"
78 #include "llvm/CodeGen/MachineOperand.h"
79 #include "llvm/CodeGen/MachineRegisterInfo.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <cstdlib>
88 #include <iterator>
89 #include <utility>
90 
91 using namespace llvm;
92 
93 #define DEBUG_TYPE "si-load-store-opt"
94 
95 namespace {
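// Classes of instructions this pass knows how to merge. For the MUBUF classes
// the enumerator value is the corresponding DWORD base opcode, so the class
// can be passed directly to AMDGPU::getMUBUFOpcode() when computing the
// merged opcode (see getNewOpcode()).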
96 enum InstClassEnum {
97   UNKNOWN,
98   DS_READ,
99   DS_WRITE,
100   S_BUFFER_LOAD_IMM,
101   BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102   BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103   BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104   BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105   BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106   BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107   BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108   BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109 };
110 
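// Bitmask of the address-carrying operands an instruction class may use; see
// getRegs().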
111 enum RegisterEnum {
112   SBASE = 0x1,
113   SRSRC = 0x2,
114   SOFFSET = 0x4,
115   VADDR = 0x8,
116   ADDR = 0x10,
117 };
118 
119 class SILoadStoreOptimizer : public MachineFunctionPass {
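  // Bookkeeping for one candidate merge: the two instructions (I and Paired),
  // their decoded offsets, widths and cache-policy bits, and the instructions
  // that must be moved below the merged instruction.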
120   struct CombineInfo {
121     MachineBasicBlock::iterator I;
122     MachineBasicBlock::iterator Paired;
123     unsigned EltSize;
124     unsigned Offset0;
125     unsigned Offset1;
126     unsigned Width0;
127     unsigned Width1;
128     unsigned BaseOff;
129     InstClassEnum InstClass;
130     bool GLC0;
131     bool GLC1;
132     bool SLC0;
133     bool SLC1;
134     bool DLC0;
135     bool DLC1;
136     bool UseST64;
137     SmallVector<MachineInstr *, 8> InstsToMove;
138   };
139 
140   struct BaseRegisters {
141     unsigned LoReg = 0;
142     unsigned HiReg = 0;
143 
144     unsigned LoSubReg = 0;
145     unsigned HiSubReg = 0;
146   };
147 
148   struct MemAddress {
149     BaseRegisters Base;
150     int64_t Offset = 0;
151   };
152 
153   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
154 
155 private:
156   const GCNSubtarget *STM = nullptr;
157   const SIInstrInfo *TII = nullptr;
158   const SIRegisterInfo *TRI = nullptr;
159   MachineRegisterInfo *MRI = nullptr;
160   AliasAnalysis *AA = nullptr;
161   bool OptimizeAgain;
162 
163   static bool offsetsCanBeCombined(CombineInfo &CI);
164   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
165   static unsigned getNewOpcode(const CombineInfo &CI);
166   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
167   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
168   unsigned getOpcodeWidth(const MachineInstr &MI) const;
169   InstClassEnum getInstClass(unsigned Opc) const;
170   unsigned getRegs(unsigned Opc) const;
171 
172   bool findMatchingInst(CombineInfo &CI);
173 
174   unsigned read2Opcode(unsigned EltSize) const;
175   unsigned read2ST64Opcode(unsigned EltSize) const;
176   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
177 
178   unsigned write2Opcode(unsigned EltSize) const;
179   unsigned write2ST64Opcode(unsigned EltSize) const;
180   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
181   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
182   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
183   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
184 
185   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
186                            int32_t NewOffset);
187   unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
188   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
189   Optional<int32_t> extractConstOffset(const MachineOperand &Op);
190   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
191   /// Promotes a constant offset to the immediate by adjusting the base. It
192   /// tries to use a base from the nearby instructions that allows the access
193   /// to have a 13-bit constant offset which gets promoted to the immediate.
194   bool promoteConstantOffsetToImm(MachineInstr &CI,
195                                   MemInfoMap &Visited,
196                                   SmallPtrSet<MachineInstr *, 4> &Promoted);
197 
198 public:
199   static char ID;
200 
201   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
202     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
203   }
204 
205   bool optimizeBlock(MachineBasicBlock &MBB);
206 
207   bool runOnMachineFunction(MachineFunction &MF) override;
208 
209   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
210 
211   void getAnalysisUsage(AnalysisUsage &AU) const override {
212     AU.setPreservesCFG();
213     AU.addRequired<AAResultsWrapperPass>();
214 
215     MachineFunctionPass::getAnalysisUsage(AU);
216   }
217 };
218 
219 } // end anonymous namespace.
220 
221 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
222                       "SI Load Store Optimizer", false, false)
223 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
224 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
225                     false, false)
226 
227 char SILoadStoreOptimizer::ID = 0;
228 
229 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
230 
231 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
232   return new SILoadStoreOptimizer();
233 }
234 
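// Move each instruction in \p InstsToMove to just after \p I, preserving their
// relative order.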
235 static void moveInstsAfter(MachineBasicBlock::iterator I,
236                            ArrayRef<MachineInstr *> InstsToMove) {
237   MachineBasicBlock *MBB = I->getParent();
238   ++I;
239   for (MachineInstr *MI : InstsToMove) {
240     MI->removeFromParent();
241     MBB->insert(I, MI);
242   }
243 }
244 
245 static void addDefsUsesToList(const MachineInstr &MI,
246                               DenseSet<unsigned> &RegDefs,
247                               DenseSet<unsigned> &PhysRegUses) {
248   for (const MachineOperand &Op : MI.operands()) {
249     if (Op.isReg()) {
250       if (Op.isDef())
251         RegDefs.insert(Op.getReg());
252       else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
253         PhysRegUses.insert(Op.getReg());
254     }
255   }
256 }
257 
258 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
259                                       MachineBasicBlock::iterator B,
260                                       AliasAnalysis *AA) {
261   // RAW or WAR - cannot reorder
262   // WAW - cannot reorder
263   // RAR - safe to reorder
264   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
265 }
266 
267 // Add MI and its defs to the lists if MI reads one of the defs that are
268 // already in the list. Returns true in that case.
269 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
270                                   DenseSet<unsigned> &PhysRegUses,
271                                   SmallVectorImpl<MachineInstr *> &Insts) {
272   for (MachineOperand &Use : MI.operands()) {
273     // If one of the defs is read, then there is a use of Def between I and the
274     // instruction that I will potentially be merged with. We will need to move
275     // this instruction after the merged instructions.
276     //
277     // Similarly, if there is a def which is read by an instruction that is to
278     // be moved for merging, then we need to move the def-instruction as well.
279     // This can only happen for physical registers such as M0; virtual
280     // registers are in SSA form.
281     if (Use.isReg() &&
282         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
283          (Use.isDef() && RegDefs.count(Use.getReg())) ||
284          (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
285           PhysRegUses.count(Use.getReg())))) {
286       Insts.push_back(&MI);
287       addDefsUsesToList(MI, RegDefs, PhysRegUses);
288       return true;
289     }
290   }
291 
292   return false;
293 }
294 
295 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
296                                     ArrayRef<MachineInstr *> InstsToMove,
297                                     AliasAnalysis *AA) {
298   assert(MemOp.mayLoadOrStore());
299 
300   for (MachineInstr *InstToMove : InstsToMove) {
301     if (!InstToMove->mayLoadOrStore())
302       continue;
303     if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
304       return false;
305   }
306   return true;
307 }
308 
309 // This function assumes that \p A and \p B are identical except for size and
310 // offset, and that they reference adjacent memory.
311 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
312                                                    const MachineMemOperand *A,
313                                                    const MachineMemOperand *B) {
314   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
315   unsigned Size = A->getSize() + B->getSize();
316   return MF.getMachineMemOperand(A, MinOffset, Size);
317 }
318 
319 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
320   // XXX - Would the same offset be OK? Is there any reason this would happen or
321   // be useful?
322   if (CI.Offset0 == CI.Offset1)
323     return false;
324 
325   // This won't be valid if the offset isn't aligned.
326   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
327     return false;
328 
329   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
330   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
331   CI.UseST64 = false;
332   CI.BaseOff = 0;
333 
334   // Handle SMEM and VMEM instructions.
335   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
336     return (EltOffset0 + CI.Width0 == EltOffset1 ||
337             EltOffset1 + CI.Width1 == EltOffset0) &&
338            CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
339            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
340   }
341 
342   // If the offset in elements doesn't fit in 8 bits, we might be able to use
343   // the stride 64 versions.
344   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
345       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
346     CI.Offset0 = EltOffset0 / 64;
347     CI.Offset1 = EltOffset1 / 64;
348     CI.UseST64 = true;
349     return true;
350   }
351 
352   // Check if the new offsets fit in the reduced 8-bit range.
353   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
354     CI.Offset0 = EltOffset0;
355     CI.Offset1 = EltOffset1;
356     return true;
357   }
358 
359   // Try to shift base address to decrease offsets.
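  // Illustrative example (EltSize == 4): byte offsets 0x1000 and 0x1008 give
  // element offsets 0x400 and 0x402, which do not fit in 8 bits, but their
  // difference does; subtracting BaseOff == 0x1000 from the base leaves
  // element offsets 0 and 2, which fit.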
360   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
361   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
362 
363   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
364     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
365     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
366     CI.UseST64 = true;
367     return true;
368   }
369 
370   if (isUInt<8>(OffsetDiff)) {
371     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
372     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
373     return true;
374   }
375 
376   return false;
377 }
378 
379 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
380                                      const CombineInfo &CI) {
381   const unsigned Width = (CI.Width0 + CI.Width1);
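  // A merged non-SMEM access is at most 4 dwords, and a 3-dword access is only
  // formed if the subtarget supports dwordx3 loads and stores.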
382   switch (CI.InstClass) {
383   default:
384     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
385   case S_BUFFER_LOAD_IMM:
386     switch (Width) {
387     default:
388       return false;
389     case 2:
390     case 4:
391       return true;
392     }
393   }
394 }
395 
396 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) const {
397   const unsigned Opc = MI.getOpcode();
398 
399   if (TII->isMUBUF(MI)) {
400     return AMDGPU::getMUBUFDwords(Opc);
401   }
402 
403   switch (Opc) {
404   default:
405     return 0;
406   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
407     return 1;
408   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
409     return 2;
410   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
411     return 4;
412   }
413 }
414 
415 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) const {
416   if (TII->isMUBUF(Opc)) {
417     const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
418 
419     // If we couldn't identify the opcode, bail out.
420     if (baseOpcode == -1) {
421       return UNKNOWN;
422     }
423 
424     switch (baseOpcode) {
425     default:
426       return UNKNOWN;
427     case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
428       return BUFFER_LOAD_OFFEN;
429     case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
430       return BUFFER_LOAD_OFFSET;
431     case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
432       return BUFFER_STORE_OFFEN;
433     case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
434       return BUFFER_STORE_OFFSET;
435     case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
436       return BUFFER_LOAD_OFFEN_exact;
437     case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
438       return BUFFER_LOAD_OFFSET_exact;
439     case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
440       return BUFFER_STORE_OFFEN_exact;
441     case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
442       return BUFFER_STORE_OFFSET_exact;
443     }
444   }
445 
446   switch (Opc) {
447   default:
448     return UNKNOWN;
449   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
450   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
451   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
452     return S_BUFFER_LOAD_IMM;
453   case AMDGPU::DS_READ_B32:
454   case AMDGPU::DS_READ_B64:
455   case AMDGPU::DS_READ_B32_gfx9:
456   case AMDGPU::DS_READ_B64_gfx9:
457     return DS_READ;
458   case AMDGPU::DS_WRITE_B32:
459   case AMDGPU::DS_WRITE_B64:
460   case AMDGPU::DS_WRITE_B32_gfx9:
461   case AMDGPU::DS_WRITE_B64_gfx9:
462     return DS_WRITE;
463   }
464 }
465 
466 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) const {
467   if (TII->isMUBUF(Opc)) {
468     unsigned result = 0;
469 
470     if (AMDGPU::getMUBUFHasVAddr(Opc)) {
471       result |= VADDR;
472     }
473 
474     if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
475       result |= SRSRC;
476     }
477 
478     if (AMDGPU::getMUBUFHasSoffset(Opc)) {
479       result |= SOFFSET;
480     }
481 
482     return result;
483   }
484 
485   switch (Opc) {
486   default:
487     return 0;
488   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
489   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
490   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
491     return SBASE;
492   case AMDGPU::DS_READ_B32:
493   case AMDGPU::DS_READ_B64:
494   case AMDGPU::DS_READ_B32_gfx9:
495   case AMDGPU::DS_READ_B64_gfx9:
496   case AMDGPU::DS_WRITE_B32:
497   case AMDGPU::DS_WRITE_B64:
498   case AMDGPU::DS_WRITE_B32_gfx9:
499   case AMDGPU::DS_WRITE_B64_gfx9:
500     return ADDR;
501   }
502 }
503 
504 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
505   MachineBasicBlock *MBB = CI.I->getParent();
506   MachineBasicBlock::iterator E = MBB->end();
507   MachineBasicBlock::iterator MBBI = CI.I;
508 
509   const unsigned Opc = CI.I->getOpcode();
510   const InstClassEnum InstClass = getInstClass(Opc);
511 
512   if (InstClass == UNKNOWN) {
513     return false;
514   }
515 
516   const unsigned Regs = getRegs(Opc);
517 
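  // Collect the address operands (addr, sbase, srsrc, soffset, vaddr) that
  // both instructions must agree on for a merge to be possible.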
518   unsigned AddrOpName[5] = {0};
519   int AddrIdx[5];
520   const MachineOperand *AddrReg[5];
521   unsigned NumAddresses = 0;
522 
523   if (Regs & ADDR) {
524     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
525   }
526 
527   if (Regs & SBASE) {
528     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
529   }
530 
531   if (Regs & SRSRC) {
532     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
533   }
534 
535   if (Regs & SOFFSET) {
536     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
537   }
538 
539   if (Regs & VADDR) {
540     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
541   }
542 
543   for (unsigned i = 0; i < NumAddresses; i++) {
544     AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
545     AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
546 
547     // We only ever merge operations with the same base address register, so
548     // don't bother scanning forward if there are no other uses.
549     if (AddrReg[i]->isReg() &&
550         (Register::isPhysicalRegister(AddrReg[i]->getReg()) ||
551          MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
552       return false;
553   }
554 
555   ++MBBI;
556 
557   DenseSet<unsigned> RegDefsToMove;
558   DenseSet<unsigned> PhysRegUsesToMove;
559   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
560 
561   for (; MBBI != E; ++MBBI) {
562     const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
563 
564     if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
565         (IsDS && (MBBI->getOpcode() != Opc))) {
566       // This is not a matching DS instruction, but we can keep looking as
567       // long as one of these conditions is met:
568       // 1. It is safe to move I down past MBBI.
569       // 2. It is safe to move MBBI down past the instruction that I will
570       //    be merged into.
571 
572       if (MBBI->hasUnmodeledSideEffects()) {
573         // We can't re-order this instruction with respect to other memory
574         // operations, so we fail both conditions mentioned above.
575         return false;
576       }
577 
578       if (MBBI->mayLoadOrStore() &&
579           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
580            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
581         // We fail condition #1, but we may still be able to satisfy condition
582         // #2.  Add this instruction to the move list and then we will check
583         // if condition #2 holds once we have selected the matching instruction.
584         CI.InstsToMove.push_back(&*MBBI);
585         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
586         continue;
587       }
588 
589       // When we match I with another DS instruction we will be moving I down
590       // to the location of the matched instruction, so any uses of I will
591       // need to be moved down as well.
592       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
593                             CI.InstsToMove);
594       continue;
595     }
596 
597     // Don't merge volatiles.
598     if (MBBI->hasOrderedMemoryRef())
599       return false;
600 
601     // Handle a case like
602     //   DS_WRITE_B32 addr, v, idx0
603     //   w = DS_READ_B32 addr, idx0
604     //   DS_WRITE_B32 addr, f(w), idx1
605     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
606     // merging of the two writes.
607     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
608                               CI.InstsToMove))
609       continue;
610 
611     bool Match = true;
612     for (unsigned i = 0; i < NumAddresses; i++) {
613       const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
614 
615       if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
616         if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
617             AddrReg[i]->getImm() != AddrRegNext.getImm()) {
618           Match = false;
619           break;
620         }
621         continue;
622       }
623 
624       // Check same base pointer. Be careful of subregisters, which can occur
625       // with vectors of pointers.
626       if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
627           AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
628         Match = false;
629         break;
630       }
631     }
632 
633     if (Match) {
634       int OffsetIdx =
635           AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
636       CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
637       CI.Width0 = getOpcodeWidth(*CI.I);
638       CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
639       CI.Width1 = getOpcodeWidth(*MBBI);
640       CI.Paired = MBBI;
641 
642       if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
643         CI.Offset0 &= 0xffff;
644         CI.Offset1 &= 0xffff;
645       } else {
646         CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
647         CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
648         if (CI.InstClass != S_BUFFER_LOAD_IMM) {
649           CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
650           CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
651         }
652         CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
653         CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
654       }
655 
656       // Check both offsets fit in the reduced range.
657       // We also need to go through the list of instructions that we plan to
658       // move and make sure they are all safe to move down past the merged
659       // instruction.
660       if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
661         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
662           return true;
663     }
664 
665     // We've found a load/store that we couldn't merge for some reason.
666     // We could potentially keep looking, but we'd need to make sure that
667     // it was safe to move I and also all the instructions in InstsToMove
668     // down past this instruction.
669     // Check if we can move I across MBBI and if we can move all of I's users.
670     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
671         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
672       break;
673   }
674   return false;
675 }
676 
677 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
678   if (STM->ldsRequiresM0Init())
679     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
680   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
681 }
682 
683 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
684   if (STM->ldsRequiresM0Init())
685     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
686 
687   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
688                         : AMDGPU::DS_READ2ST64_B64_gfx9;
689 }
690 
691 MachineBasicBlock::iterator
692 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
693   MachineBasicBlock *MBB = CI.I->getParent();
694 
695   // Be careful, since the addresses could be subregisters themselves in weird
696   // cases, like vectors of pointers.
697   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
698 
699   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
700   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
701 
702   unsigned NewOffset0 = CI.Offset0;
703   unsigned NewOffset1 = CI.Offset1;
704   unsigned Opc =
705       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
706 
707   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
708   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
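  // For 4-byte elements the two results occupy sub0/sub1 of a 64-bit pair;
  // for 8-byte elements they occupy sub0_sub1/sub2_sub3 of a 128-bit tuple.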
709 
710   if (NewOffset0 > NewOffset1) {
711     // Canonicalize the merged instruction so the smaller offset comes first.
712     std::swap(NewOffset0, NewOffset1);
713     std::swap(SubRegIdx0, SubRegIdx1);
714   }
715 
716   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
717          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
718 
719   const MCInstrDesc &Read2Desc = TII->get(Opc);
720 
721   const TargetRegisterClass *SuperRC =
722       (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
723   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
724 
725   DebugLoc DL = CI.I->getDebugLoc();
726 
727   unsigned BaseReg = AddrReg->getReg();
728   unsigned BaseSubReg = AddrReg->getSubReg();
729   unsigned BaseRegFlags = 0;
730   if (CI.BaseOff) {
731     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
732     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
733         .addImm(CI.BaseOff);
734 
735     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
736     BaseRegFlags = RegState::Kill;
737 
738     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
739         .addReg(ImmReg)
740         .addReg(AddrReg->getReg(), 0, BaseSubReg)
741         .addImm(0); // clamp bit
742     BaseSubReg = 0;
743   }
744 
745   MachineInstrBuilder Read2 =
746       BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
747           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
748           .addImm(NewOffset0)                        // offset0
749           .addImm(NewOffset1)                        // offset1
750           .addImm(0)                                 // gds
751           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
752 
753   (void)Read2;
754 
755   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
756 
757   // Copy to the old destination registers.
758   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
759       .add(*Dest0) // Copy to same destination including flags and sub reg.
760       .addReg(DestReg, 0, SubRegIdx0);
761   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
762                             .add(*Dest1)
763                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
764 
765   moveInstsAfter(Copy1, CI.InstsToMove);
766 
767   MachineBasicBlock::iterator Next = std::next(CI.I);
768   CI.I->eraseFromParent();
769   CI.Paired->eraseFromParent();
770 
771   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
772   return Next;
773 }
774 
775 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
776   if (STM->ldsRequiresM0Init())
777     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
778   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
779                         : AMDGPU::DS_WRITE2_B64_gfx9;
780 }
781 
782 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
783   if (STM->ldsRequiresM0Init())
784     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
785                           : AMDGPU::DS_WRITE2ST64_B64;
786 
787   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
788                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
789 }
790 
791 MachineBasicBlock::iterator
792 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
793   MachineBasicBlock *MBB = CI.I->getParent();
794 
795   // Be sure to use .add(), and not .addReg(), with these. We want to be
796   // sure we preserve the subregister index and any register flags set on them.
797   const MachineOperand *AddrReg =
798       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
799   const MachineOperand *Data0 =
800       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
801   const MachineOperand *Data1 =
802       TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
803 
804   unsigned NewOffset0 = CI.Offset0;
805   unsigned NewOffset1 = CI.Offset1;
806   unsigned Opc =
807       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
808 
809   if (NewOffset0 > NewOffset1) {
810     // Canonicalize the merged instruction so the smaller offset comes first.
811     std::swap(NewOffset0, NewOffset1);
812     std::swap(Data0, Data1);
813   }
814 
815   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
816          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
817 
818   const MCInstrDesc &Write2Desc = TII->get(Opc);
819   DebugLoc DL = CI.I->getDebugLoc();
820 
821   unsigned BaseReg = AddrReg->getReg();
822   unsigned BaseSubReg = AddrReg->getSubReg();
823   unsigned BaseRegFlags = 0;
824   if (CI.BaseOff) {
825     unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
826     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
827         .addImm(CI.BaseOff);
828 
829     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
830     BaseRegFlags = RegState::Kill;
831 
832     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
833         .addReg(ImmReg)
834         .addReg(AddrReg->getReg(), 0, BaseSubReg)
835         .addImm(0); // clamp bit
836     BaseSubReg = 0;
837   }
838 
839   MachineInstrBuilder Write2 =
840       BuildMI(*MBB, CI.Paired, DL, Write2Desc)
841           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
842           .add(*Data0)                               // data0
843           .add(*Data1)                               // data1
844           .addImm(NewOffset0)                        // offset0
845           .addImm(NewOffset1)                        // offset1
846           .addImm(0)                                 // gds
847           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
848 
849   moveInstsAfter(Write2, CI.InstsToMove);
850 
851   MachineBasicBlock::iterator Next = std::next(CI.I);
852   CI.I->eraseFromParent();
853   CI.Paired->eraseFromParent();
854 
855   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
856   return Next;
857 }
858 
859 MachineBasicBlock::iterator
860 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
861   MachineBasicBlock *MBB = CI.I->getParent();
862   DebugLoc DL = CI.I->getDebugLoc();
863   const unsigned Opcode = getNewOpcode(CI);
864 
865   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
866 
867   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
868   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
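  // Emit a single wide load at the smaller of the two offsets, then copy the
  // halves back into the original destination registers.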
869 
870   // It shouldn't be possible to get this far if the two instructions
871   // don't have a single memoperand, because MachineInstr::mayAlias()
872   // will return true if this is the case.
873   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
874 
875   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
876   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
877 
878   BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
879       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
880       .addImm(MergedOffset) // offset
881       .addImm(CI.GLC0)      // glc
882       .addImm(CI.DLC0)      // dlc
883       .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
884 
885   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
886   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
887   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
888 
889   // Copy to the old destination registers.
890   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
891   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
892   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
893 
894   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
895       .add(*Dest0) // Copy to same destination including flags and sub reg.
896       .addReg(DestReg, 0, SubRegIdx0);
897   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
898                             .add(*Dest1)
899                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
900 
901   moveInstsAfter(Copy1, CI.InstsToMove);
902 
903   MachineBasicBlock::iterator Next = std::next(CI.I);
904   CI.I->eraseFromParent();
905   CI.Paired->eraseFromParent();
906   return Next;
907 }
908 
909 MachineBasicBlock::iterator
910 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
911   MachineBasicBlock *MBB = CI.I->getParent();
912   DebugLoc DL = CI.I->getDebugLoc();
913 
914   const unsigned Opcode = getNewOpcode(CI);
915 
916   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
917 
918   // Create the new, wider destination register for the merged load.
919   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
920   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
921 
922   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
923 
924   const unsigned Regs = getRegs(Opcode);
925 
926   if (Regs & VADDR)
927     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
928 
929   // It shouldn't be possible to get this far if the two instructions
930   // don't have a single memoperand, because MachineInstr::mayAlias()
931   // will return true if this is the case.
932   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
933 
934   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
935   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
936 
937   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
938       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
939       .addImm(MergedOffset) // offset
940       .addImm(CI.GLC0)      // glc
941       .addImm(CI.SLC0)      // slc
942       .addImm(0)            // tfe
943       .addImm(CI.DLC0)      // dlc
944       .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
945 
946   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
947   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
948   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
949 
950   // Copy to the old destination registers.
951   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
952   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
953   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
954 
955   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
956       .add(*Dest0) // Copy to same destination including flags and sub reg.
957       .addReg(DestReg, 0, SubRegIdx0);
958   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
959                             .add(*Dest1)
960                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
961 
962   moveInstsAfter(Copy1, CI.InstsToMove);
963 
964   MachineBasicBlock::iterator Next = std::next(CI.I);
965   CI.I->eraseFromParent();
966   CI.Paired->eraseFromParent();
967   return Next;
968 }
969 
970 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
971   const unsigned Width = CI.Width0 + CI.Width1;
972 
973   switch (CI.InstClass) {
974   default:
975     return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
976   case UNKNOWN:
977     llvm_unreachable("Unknown instruction class");
978   case S_BUFFER_LOAD_IMM:
979     switch (Width) {
980     default:
981       return 0;
982     case 2:
983       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
984     case 4:
985       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
986     }
987   }
988 }
989 
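// Returns the sub-register indices of the two original values within the
// merged register, ordered so the instruction with the smaller offset maps to
// the lower sub-registers.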
990 std::pair<unsigned, unsigned>
991 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
992   if (CI.Offset0 > CI.Offset1) {
993     switch (CI.Width0) {
994     default:
995       return std::make_pair(0, 0);
996     case 1:
997       switch (CI.Width1) {
998       default:
999         return std::make_pair(0, 0);
1000       case 1:
1001         return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
1002       case 2:
1003         return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
1004       case 3:
1005         return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
1006       }
1007     case 2:
1008       switch (CI.Width1) {
1009       default:
1010         return std::make_pair(0, 0);
1011       case 1:
1012         return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
1013       case 2:
1014         return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
1015       }
1016     case 3:
1017       switch (CI.Width1) {
1018       default:
1019         return std::make_pair(0, 0);
1020       case 1:
1021         return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
1022       }
1023     }
1024   } else {
1025     switch (CI.Width0) {
1026     default:
1027       return std::make_pair(0, 0);
1028     case 1:
1029       switch (CI.Width1) {
1030       default:
1031         return std::make_pair(0, 0);
1032       case 1:
1033         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1034       case 2:
1035         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1036       case 3:
1037         return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1038       }
1039     case 2:
1040       switch (CI.Width1) {
1041       default:
1042         return std::make_pair(0, 0);
1043       case 1:
1044         return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1045       case 2:
1046         return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1047       }
1048     case 3:
1049       switch (CI.Width1) {
1050       default:
1051         return std::make_pair(0, 0);
1052       case 1:
1053         return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1054       }
1055     }
1056   }
1057 }
1058 
1059 const TargetRegisterClass *
1060 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1061   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1062     switch (CI.Width0 + CI.Width1) {
1063     default:
1064       return nullptr;
1065     case 2:
1066       return &AMDGPU::SReg_64_XEXECRegClass;
1067     case 4:
1068       return &AMDGPU::SReg_128RegClass;
1069     case 8:
1070       return &AMDGPU::SReg_256RegClass;
1071     case 16:
1072       return &AMDGPU::SReg_512RegClass;
1073     }
1074   } else {
1075     switch (CI.Width0 + CI.Width1) {
1076     default:
1077       return nullptr;
1078     case 2:
1079       return &AMDGPU::VReg_64RegClass;
1080     case 3:
1081       return &AMDGPU::VReg_96RegClass;
1082     case 4:
1083       return &AMDGPU::VReg_128RegClass;
1084     }
1085   }
1086 }
1087 
1088 MachineBasicBlock::iterator
1089 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1090   MachineBasicBlock *MBB = CI.I->getParent();
1091   DebugLoc DL = CI.I->getDebugLoc();
1092 
1093   const unsigned Opcode = getNewOpcode(CI);
1094 
1095   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1096   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1097   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1098 
1099   // Copy to the new source register.
1100   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1101   unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1102 
1103   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1104   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1105 
1106   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1107       .add(*Src0)
1108       .addImm(SubRegIdx0)
1109       .add(*Src1)
1110       .addImm(SubRegIdx1);
1111 
1112   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1113                  .addReg(SrcReg, RegState::Kill);
1114 
1115   const unsigned Regs = getRegs(Opcode);
1116 
1117   if (Regs & VADDR)
1118     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1119 
1120 
1121   // It shouldn't be possible to get this far if the two instructions
1122   // don't have a single memoperand, because MachineInstr::mayAlias()
1123   // will return true if this is the case.
1124   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1125 
1126   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1127   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1128 
1129   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1130       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1131       .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1132       .addImm(CI.GLC0)      // glc
1133       .addImm(CI.SLC0)      // slc
1134       .addImm(0)            // tfe
1135       .addImm(CI.DLC0)      // dlc
1136       .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1137 
1138   moveInstsAfter(MIB, CI.InstsToMove);
1139 
1140   MachineBasicBlock::iterator Next = std::next(CI.I);
1141   CI.I->eraseFromParent();
1142   CI.Paired->eraseFromParent();
1143   return Next;
1144 }
1145 
1146 MachineOperand
1147 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1148   APInt V(32, Val, true);
1149   if (TII->isInlineConstant(V))
1150     return MachineOperand::CreateImm(Val);
1151 
1152   unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1153   MachineInstr *Mov =
1154   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1155           TII->get(AMDGPU::S_MOV_B32), Reg)
1156     .addImm(Val);
1157   (void)Mov;
1158   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1159   return MachineOperand::CreateReg(Reg, false);
1160 }
1161 
1162 // Compute base address using Addr and return the final register.
1163 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1164                                            const MemAddress &Addr) {
1165   MachineBasicBlock *MBB = MI.getParent();
1166   MachineBasicBlock::iterator MBBI = MI.getIterator();
1167   DebugLoc DL = MI.getDebugLoc();
1168 
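  // The new base is materialized as a 64-bit add, roughly:
  //   %lo, %carry = V_ADD_I32_e64  Addr.Base.LoReg, lo32(Addr.Offset)
  //   %hi         = V_ADDC_U32_e64 Addr.Base.HiReg, hi32(Addr.Offset), %carry
  //   %base       = REG_SEQUENCE   %lo, sub0, %hi, sub1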
1169   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1170           Addr.Base.LoSubReg) &&
1171          "Expected 32-bit Base-Register-Low!!");
1172 
1173   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1174           Addr.Base.HiSubReg) &&
1175          "Expected 32-bit Base-Register-Hi!!");
1176 
1177   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1178   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1179   MachineOperand OffsetHi =
1180     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1181 
1182   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1183   unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
1184   unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1185 
1186   unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1187   unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1188   MachineInstr *LoHalf =
1189     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1190       .addReg(CarryReg, RegState::Define)
1191       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1192       .add(OffsetLo)
1193       .addImm(0); // clamp bit
1194   (void)LoHalf;
1195   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1196 
1197   MachineInstr *HiHalf =
1198   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1199     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1200     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1201     .add(OffsetHi)
1202     .addReg(CarryReg, RegState::Kill)
1203     .addImm(0); // clamp bit
1204   (void)HiHalf;
1205   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1206 
1207   unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1208   MachineInstr *FullBase =
1209     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1210       .addReg(DestSub0)
1211       .addImm(AMDGPU::sub0)
1212       .addReg(DestSub1)
1213       .addImm(AMDGPU::sub1);
1214   (void)FullBase;
1215   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1216 
1217   return FullDestReg;
1218 }
1219 
1220 // Update MI's base register and offset to NewBase and NewOffset.
1221 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1222                                                unsigned NewBase,
1223                                                int32_t NewOffset) {
1224   TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1225   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1226 }
1227 
1228 Optional<int32_t>
1229 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1230   if (Op.isImm())
1231     return Op.getImm();
1232 
1233   if (!Op.isReg())
1234     return None;
1235 
1236   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1237   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1238       !Def->getOperand(1).isImm())
1239     return None;
1240 
1241   return Def->getOperand(1).getImm();
1242 }
1243 
1244 // Analyzes Base and extracts:
1245 //  - 32bit base registers, subregisters
1246 //  - 64bit constant offset
1247 // Expecting base computation as:
1248 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1249 //   %LO:vgpr_32, %c:sreg_64_xexec =
1250 //       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1251 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1252 //   %Base:vreg_64 =
1253 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1254 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1255                                                       MemAddress &Addr) {
1256   if (!Base.isReg())
1257     return;
1258 
1259   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1260   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1261       || Def->getNumOperands() != 5)
1262     return;
1263 
1264   MachineOperand BaseLo = Def->getOperand(1);
1265   MachineOperand BaseHi = Def->getOperand(3);
1266   if (!BaseLo.isReg() || !BaseHi.isReg())
1267     return;
1268 
1269   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1270   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1271 
1272   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1273       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1274     return;
1275 
1276   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1277   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1278 
1279   auto Offset0P = extractConstOffset(*Src0);
1280   if (Offset0P)
1281     BaseLo = *Src1;
1282   else {
1283     if (!(Offset0P = extractConstOffset(*Src1)))
1284       return;
1285     BaseLo = *Src0;
1286   }
1287 
1288   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1289   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1290 
1291   if (Src0->isImm())
1292     std::swap(Src0, Src1);
1293 
1294   if (!Src1->isImm())
1295     return;
1296 
1297   uint64_t Offset1 = Src1->getImm();
1298   BaseHi = *Src0;
1299 
1300   Addr.Base.LoReg = BaseLo.getReg();
1301   Addr.Base.HiReg = BaseHi.getReg();
1302   Addr.Base.LoSubReg = BaseLo.getSubReg();
1303   Addr.Base.HiSubReg = BaseHi.getSubReg();
1304   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1305 }
1306 
1307 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1308     MachineInstr &MI,
1309     MemInfoMap &Visited,
1310     SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1311 
1312   // TODO: Support flat and scratch.
1313   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1314       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1315     return false;
1316 
1317   // TODO: Support Store.
1318   if (!MI.mayLoad())
1319     return false;
1320 
1321   if (AnchorList.count(&MI))
1322     return false;
1323 
1324   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1325 
1326   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1327     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1328     return false;
1329   }
1330 
1331   // Step1: Find the base-registers and a 64bit constant offset.
1332   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1333   MemAddress MAddr;
1334   if (Visited.find(&MI) == Visited.end()) {
1335     processBaseWithConstOffset(Base, MAddr);
1336     Visited[&MI] = MAddr;
1337   } else
1338     MAddr = Visited[&MI];
1339 
1340   if (MAddr.Offset == 0) {
1341     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1342                          " constant offsets that can be promoted.\n";);
1343     return false;
1344   }
1345 
1346   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1347              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1348 
1349   // Step2: Traverse through MI's basic block and find an anchor (that has the
1350   // same base-registers) with the highest 13-bit distance from MI's offset.
1351   // E.g. (64bit loads)
1352   // bb:
1353   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1354   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1355   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1356   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1357   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1358   //
1359   // Starting from the first load, the optimization will try to find a new base
1360   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1361   // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1362   // as the new base (anchor) because of the maximum distance, which can
1363   // presumably accommodate more intermediate bases.
1364   //
1365   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1366   // (&a + 8192) for load1, load2, load4.
1367   //   addr = &a + 8192
1368   //   load1 = load(addr,       -4096)
1369   //   load2 = load(addr,       -2048)
1370   //   load3 = load(addr,       0)
1371   //   load4 = load(addr,       2048)
1372   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1373   //
1374   MachineInstr *AnchorInst = nullptr;
1375   MemAddress AnchorAddr;
1376   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1377   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1378 
1379   MachineBasicBlock *MBB = MI.getParent();
1380   MachineBasicBlock::iterator E = MBB->end();
1381   MachineBasicBlock::iterator MBBI = MI.getIterator();
1382   ++MBBI;
1383   const SITargetLowering *TLI =
1384     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1385 
1386   for ( ; MBBI != E; ++MBBI) {
1387     MachineInstr &MINext = *MBBI;
1388     // TODO: Support finding an anchor(with same base) from store addresses or
1389     // any other load addresses where the opcodes are different.
1390     if (MINext.getOpcode() != MI.getOpcode() ||
1391         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1392       continue;
1393 
1394     const MachineOperand &BaseNext =
1395       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1396     MemAddress MAddrNext;
1397     if (Visited.find(&MINext) == Visited.end()) {
1398       processBaseWithConstOffset(BaseNext, MAddrNext);
1399       Visited[&MINext] = MAddrNext;
1400     } else
1401       MAddrNext = Visited[&MINext];
1402 
1403     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1404         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1405         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1406         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1407       continue;
1408 
1409     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1410 
1411     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1412     TargetLoweringBase::AddrMode AM;
1413     AM.HasBaseReg = true;
1414     AM.BaseOffs = Dist;
1415     if (TLI->isLegalGlobalAddressingMode(AM) &&
1416         (uint32_t)std::abs(Dist) > MaxDist) {
1417       MaxDist = std::abs(Dist);
1418 
1419       AnchorAddr = MAddrNext;
1420       AnchorInst = &MINext;
1421     }
1422   }
1423 
1424   if (AnchorInst) {
1425     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1426                AnchorInst->dump());
1427     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1428                <<  AnchorAddr.Offset << "\n\n");
1429 
1430     // Instead of moving up, just re-compute anchor-instruction's base address.
1431     unsigned Base = computeBase(MI, AnchorAddr);
1432 
1433     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1434     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1435 
1436     for (auto P : InstsWCommonBase) {
1437       TargetLoweringBase::AddrMode AM;
1438       AM.HasBaseReg = true;
1439       AM.BaseOffs = P.second - AnchorAddr.Offset;
1440 
1441       if (TLI->isLegalGlobalAddressingMode(AM)) {
1442         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1443                    dbgs() << ")"; P.first->dump());
1444         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1445         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1446       }
1447     }
1448     AnchorList.insert(AnchorInst);
1449     return true;
1450   }
1451 
1452   return false;
1453 }
1454 
1455 // Scan through looking for adjacent memory operations with constant offsets from
1456 // the same base register. We rely on the scheduler to do the hard work of
1457 // clustering nearby loads, and assume these are all adjacent.
1458 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1459   bool Modified = false;
1460 
1461   // Maps visited instructions to their computed base and offset.
1462   MemInfoMap Visited;
1463   // Contains the list of instructions for which constant offsets are being
1464   // promoted to the IMM.
1465   SmallPtrSet<MachineInstr *, 4> AnchorList;
1466 
1467   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1468     MachineInstr &MI = *I;
1469 
1470     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1471       Modified = true;
1472 
1473     // Don't combine if volatile.
1474     if (MI.hasOrderedMemoryRef()) {
1475       ++I;
1476       continue;
1477     }
1478 
1479     const unsigned Opc = MI.getOpcode();
1480 
1481     CombineInfo CI;
1482     CI.I = I;
1483     CI.InstClass = getInstClass(Opc);
1484 
1485     switch (CI.InstClass) {
1486     default:
1487       break;
1488     case DS_READ:
1489       CI.EltSize =
1490           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1491                                                                           : 4;
1492       if (findMatchingInst(CI)) {
1493         Modified = true;
1494         I = mergeRead2Pair(CI);
1495       } else {
1496         ++I;
1497       }
1498       continue;
1499     case DS_WRITE:
1500       CI.EltSize =
1501           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1502                                                                             : 4;
1503       if (findMatchingInst(CI)) {
1504         Modified = true;
1505         I = mergeWrite2Pair(CI);
1506       } else {
1507         ++I;
1508       }
1509       continue;
1510     case S_BUFFER_LOAD_IMM:
1511       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1512       if (findMatchingInst(CI)) {
1513         Modified = true;
1514         I = mergeSBufferLoadImmPair(CI);
1515         OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1516       } else {
1517         ++I;
1518       }
1519       continue;
1520     case BUFFER_LOAD_OFFEN:
1521     case BUFFER_LOAD_OFFSET:
1522     case BUFFER_LOAD_OFFEN_exact:
1523     case BUFFER_LOAD_OFFSET_exact:
1524       CI.EltSize = 4;
1525       if (findMatchingInst(CI)) {
1526         Modified = true;
1527         I = mergeBufferLoadPair(CI);
1528         OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1529       } else {
1530         ++I;
1531       }
1532       continue;
1533     case BUFFER_STORE_OFFEN:
1534     case BUFFER_STORE_OFFSET:
1535     case BUFFER_STORE_OFFEN_exact:
1536     case BUFFER_STORE_OFFSET_exact:
1537       CI.EltSize = 4;
1538       if (findMatchingInst(CI)) {
1539         Modified = true;
1540         I = mergeBufferStorePair(CI);
1541         OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1542       } else {
1543         ++I;
1544       }
1545       continue;
1546     }
1547 
1548     ++I;
1549   }
1550 
1551   return Modified;
1552 }
1553 
1554 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1555   if (skipFunction(MF.getFunction()))
1556     return false;
1557 
1558   STM = &MF.getSubtarget<GCNSubtarget>();
1559   if (!STM->loadStoreOptEnabled())
1560     return false;
1561 
1562   TII = STM->getInstrInfo();
1563   TRI = &TII->getRegisterInfo();
1564 
1565   MRI = &MF.getRegInfo();
1566   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1567 
1568   assert(MRI->isSSA() && "Must be run on SSA");
1569 
1570   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1571 
1572   bool Modified = false;
1573 
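  // A merged instruction may itself be mergeable with a later one (e.g. two
  // s_buffer_load_dwordx2 results into a dwordx4), so reprocess each block
  // until OptimizeAgain is no longer set.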
1574   for (MachineBasicBlock &MBB : MF) {
1575     do {
1576       OptimizeAgain = false;
1577       Modified |= optimizeBlock(MBB);
1578     } while (OptimizeAgain);
1579   }
1580 
1581   return Modified;
1582 }
1583