//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows it to have a 13-bit constant offset, and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly
//   run before scheduling. It currently misses stores of constants because
//   loading the constant into the data register is placed between the stores,
//   although this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offset field, but are close enough together, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"
namespace {

enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

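// Bitmask of the address operand kinds an instruction may carry. getRegs()
// computes this per opcode, and findMatchingInst() uses it to decide which
// operands two instructions must agree on before they can be merged.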
enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI);
  InstClassEnum getInstClass(unsigned Opc);
  unsigned getRegs(unsigned Opc);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset);
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

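// Move each instruction in InstsToMove, in order, to the point immediately
// after I, preserving their relative order.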
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

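// Record MI's register defs in RegDefs and its physical-register reads in
// PhysRegUses, so that later instructions can be tested for dependencies on
// the instructions we plan to move.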
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

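// Return true if each memory operation in InstsToMove can be safely
// reordered with respect to MemOp.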
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

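// Decide whether the two offsets in CI can be encoded in a single merged
// instruction. SMEM/VMEM accesses merely need to be adjacent; DS offsets must
// fit the 8-bit offset fields, possibly by switching to the stride-64 (ST64)
// forms or by rebasing via CI.BaseOff. Updates CI.Offset0/1, CI.UseST64 and
// CI.BaseOff on success.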
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

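// Check that the combined width is one the target can encode: at most four
// dwords for buffer accesses (three only if the subtarget has dwordx3
// load/stores), and exactly two or four dwords for S_BUFFER_LOAD_IMM.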
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

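// Width of the memory access in dwords, or 0 for unhandled opcodes.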
unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    return AMDGPU::getMUBUFDwords(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

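// Classify Opc into one of the InstClassEnum buckets. MUBUF opcodes are
// looked up through their base opcode so that all width variants of the same
// addressing mode land in the same class; anything unhandled is UNKNOWN.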
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

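// Return the RegisterEnum bitmask of address operands present on Opc.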
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

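// Scan forward from CI.I for an instruction this one can be merged with.
// On success CI.Paired points at the match, the offset/width/GLC/SLC fields
// of CI are filled in, and CI.InstsToMove lists the intervening instructions
// that must be moved below the merged access.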
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another instruction, we will be moving I down to
      // the location of the matched instruction, so any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI, and if we can move all of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

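// Merged DS opcode selection: subtargets whose LDS instructions require M0
// to be initialized use the original encodings; otherwise the gfx9 variants
// are used.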
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

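// Replace CI.I and CI.Paired with a single ds_read2 (or ds_read2st64) into a
// new wide register, then copy each half back to the original destinations.
// For example, with EltSize == 4:
//   v0 = ds_read_b32 v2 offset:16
//   v1 = ds_read_b32 v2 offset:32
// ==>
//   v[0:1] = ds_read2_b32 v2 offset0:4 offset1:8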
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

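// Replace CI.I and CI.Paired with a single ds_write2 (or ds_write2st64) that
// stores both data operands, rebasing the address first if CI.BaseOff is set.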
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg(), with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

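// Merge two S_BUFFER_LOAD_*_IMM instructions into one wider load and copy
// the subregisters of the result back to the original destinations.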
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

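// Merge two MUBUF loads into one wider load and copy the subregisters of the
// result back to the original destinations.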
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Create the new, wider destination register.
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

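// Opcode of the merged instruction for the combined width, e.g. two merged
// S_BUFFER_LOAD_DWORD_IMMs become one S_BUFFER_LOAD_DWORDX2_IMM.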
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

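// Subregister indices used to copy the two original values out of (or into)
// the merged wide register; the instruction with the smaller offset always
// occupies the low subregisters.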
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

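// Select the register class for the merged result from the combined width in
// dwords: scalar (SGPR) classes for S_BUFFER loads, vector (VGPR) classes
// otherwise.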
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

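// Wrap a 32-bit constant as a machine operand: use it directly as an
// immediate when it is encodable as an inline constant (for AMDGPU, small
// integers such as -16..64), otherwise materialize it into a fresh SGPR with
// S_MOV_B32 and return that register operand.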
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
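// For instance, with a 64-bit Addr.Offset of 0x1000 this emits roughly
// (illustrative MIR; register names are placeholders):
//   %off:sgpr_32 = S_MOV_B32 0x1000        ; low half is not inline-encodable
//   %lo:vgpr_32, %c:sreg_64_xexec = V_ADD_I32_e64 %Base.LoReg, %off
//   %hi:vgpr_32 = V_ADDC_U32_e64 %Base.HiReg, 0, %c
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1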
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  unsigned CarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo);
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill);
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

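// Return the constant carried by Op, if any: either an immediate operand or
// a register whose unique definition is an S_MOV_B32 of an immediate.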
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// Expecting the base computation to look like:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

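// Try to rewrite a global load whose address is a 64-bit base plus a large
// constant into a form that reuses a nearby instruction's recomputed base,
// folding the remaining distance into the instruction's immediate offset
// field. Returns true if MI was rewritten.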
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) {

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  // TODO: Support stores.
  if (!MI.mayLoad())
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and the 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // distance from MI's offset that still fits in the 13-bit immediate.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new
  // base from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and
  // &a + 8192 have a 13-bit distance from &a + 4096; the heuristic picks
  // &a + 8192 as the new base (anchor) because the maximum distance can
  // presumably accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second; dbgs() << ")";
                   P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Caches the base address and constant offset computed for each visited
  // instruction, so repeated scans don't re-analyze the same address.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile or otherwise ordered.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}


bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
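    // A merged pair may itself be mergeable with another instruction (e.g.
    // two dwordx2 results of S_BUFFER loads can combine again into a
    // dwordx4), so re-run the block optimization until it reaches a fixpoint.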
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}