//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};

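// Machine pass that folds the source operands of movs and copies (immediates,
// frame indexes and registers) directly into their uses, and also folds
// clamp and output-modifier (omod) patterns into their defining instructions.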
class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned Opc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

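// Apply a recorded fold to its use instruction: rewrite the use operand into
// the folded immediate, frame index or register. Folds that require shrinking
// to the 32-bit encoding build the shrunk instruction here. Returns false if
// the fold cannot be applied after all.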
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // If upper part is all zero we do not need op_sel_hi.
      if (!isUInt<16>(Fold.ImmToFold)) {
        if (!(Fold.ImmToFold & 0xffff)) {
          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
          return true;
        }
        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
    if (Liveness != MachineBasicBlock::LQR_Dead)
      return false;

    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    int Op32 = Fold.getShrinkOpcode();
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->RemoveOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);

    Old.setIsUndef(New->isUndef());
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

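// Return true if \p MI is already the use instruction of a pending fold in
// \p FoldList.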
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

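// Try to record a fold of \p OpToFold into operand \p OpNo of \p MI. If the
// operand is not legal as-is, this may rewrite the instruction (mac -> mad,
// s_setreg_b32 -> s_setreg_imm32_b32) or commute it to make the fold legal.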
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call of commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
           Opc == AMDGPU::V_SUB_I32_e64 ||
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
          (OpToFold->isImm() || OpToFold->isFI())) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        // Make sure to get the 32-bit version of the commuted opcode.
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                         Op32));
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

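// Consider folding \p OpToFold into the use at \p UseOpIdx of \p UseMI.
// Successful candidates are appended to \p FoldList; copies that must be
// mutated into mov instructions are collected in \p CopiesToReplace.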
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    // into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned SrcReg = UseMI->getOperand(1).getReg();
    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
      const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
               Use = MRI->use_begin(DestReg), E = MRI->use_end();
             Use != E; Use = NextUse) {
          NextUse = std::next(Use);
          FoldCandidate FC = FoldCandidate(Use->getParent(),
                                           Use.getOperandNo(),
                                           &UseMI->getOperand(1));
          CopyUses.push_back(FC);
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
                      FoldList, CopiesToReplace);
        }
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
        !UseMI->getOperand(1).getSubReg()) {
      UseMI->getOperand(1).setReg(OpToFold.getReg());
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);
      return;
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

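// Constant-evaluate a 32-bit bitwise or shift operation on two known values.
// Returns true and sets \p Result if \p Opcode is one of the handled opcodes.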
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

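// Pick the 32-bit move opcode for a scalar (SGPR) or vector (VGPR) result.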
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

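// Look through a virtual register defined by a move-immediate and return the
// immediate operand of that def; otherwise return \p Op unchanged.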
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
    if (Src0->isImm() && Src0->getImm() == 0) {
      // v_lshl_or_b32 0, X, Y -> copy Y
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
      MI->RemoveOperand(Src1Idx);
      MI->RemoveOperand(Src0Idx);

      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
      return true;
    }
  }

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand; src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

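// Fold \p OpToFold, the source operand of \p MI, into the uses of MI's
// destination register, constant folding or simplifying the users where
// possible.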
foldInstOperand(MachineInstr & MI,MachineOperand & OpToFold) const791f1a29dd3SDimitry Andric void SIFoldOperands::foldInstOperand(MachineInstr &MI,
792f1a29dd3SDimitry Andric MachineOperand &OpToFold) const {
793f1a29dd3SDimitry Andric // We need mutate the operands of new mov instructions to add implicit
794f1a29dd3SDimitry Andric // uses of EXEC, but adding them invalidates the use_iterator, so defer
795f1a29dd3SDimitry Andric // this.
796f1a29dd3SDimitry Andric SmallVector<MachineInstr *, 4> CopiesToReplace;
797f1a29dd3SDimitry Andric SmallVector<FoldCandidate, 4> FoldList;
798f1a29dd3SDimitry Andric MachineOperand &Dst = MI.getOperand(0);
799f1a29dd3SDimitry Andric
800f1a29dd3SDimitry Andric bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
801f1a29dd3SDimitry Andric if (FoldingImm) {
802f1a29dd3SDimitry Andric unsigned NumLiteralUses = 0;
803f1a29dd3SDimitry Andric MachineOperand *NonInlineUse = nullptr;
804f1a29dd3SDimitry Andric int NonInlineUseOpNo = -1;
805f1a29dd3SDimitry Andric
8062cab237bSDimitry Andric MachineRegisterInfo::use_iterator NextUse;
807f1a29dd3SDimitry Andric for (MachineRegisterInfo::use_iterator
808f1a29dd3SDimitry Andric Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
809f1a29dd3SDimitry Andric Use != E; Use = NextUse) {
810f1a29dd3SDimitry Andric NextUse = std::next(Use);
811f1a29dd3SDimitry Andric MachineInstr *UseMI = Use->getParent();
812f1a29dd3SDimitry Andric unsigned OpNo = Use.getOperandNo();
813f1a29dd3SDimitry Andric
814f1a29dd3SDimitry Andric // Folding the immediate may reveal operations that can be constant
815f1a29dd3SDimitry Andric // folded or replaced with a copy. This can happen for example after
816f1a29dd3SDimitry Andric // frame indices are lowered to constants or from splitting 64-bit
817f1a29dd3SDimitry Andric // constants.
818f1a29dd3SDimitry Andric //
819f1a29dd3SDimitry Andric // We may also encounter cases where one or both operands are
820f1a29dd3SDimitry Andric // immediates materialized into a register, which would ordinarily not
821f1a29dd3SDimitry Andric // be folded due to multiple uses or operand constraints.
822f1a29dd3SDimitry Andric
823f1a29dd3SDimitry Andric if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
8244ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
825f1a29dd3SDimitry Andric
826f1a29dd3SDimitry Andric // Some constant folding cases change the same immediate's use to a new
827f1a29dd3SDimitry Andric // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
828f1a29dd3SDimitry Andric // again. The same constant folded instruction could also have a second
829f1a29dd3SDimitry Andric // use operand.
830f1a29dd3SDimitry Andric NextUse = MRI->use_begin(Dst.getReg());
831b40b48b8SDimitry Andric FoldList.clear();
832f1a29dd3SDimitry Andric continue;
833f1a29dd3SDimitry Andric }
834f1a29dd3SDimitry Andric
835f1a29dd3SDimitry Andric // Try to fold any inline immediate uses, and then only fold other
836f1a29dd3SDimitry Andric // constants if they have one use.
837f1a29dd3SDimitry Andric //
838f1a29dd3SDimitry Andric // The legality of the inline immediate must be checked based on the use
839f1a29dd3SDimitry Andric // operand, not the defining instruction, because 32-bit instructions
840f1a29dd3SDimitry Andric // with 32-bit inline immediate sources may be used to materialize
841f1a29dd3SDimitry Andric // constants used in 16-bit operands.
842f1a29dd3SDimitry Andric //
843f1a29dd3SDimitry Andric // e.g. it is unsafe to fold:
844f1a29dd3SDimitry Andric // s_mov_b32 s0, 1.0 // materializes 0x3f800000
845f1a29dd3SDimitry Andric // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
846f1a29dd3SDimitry Andric
847f1a29dd3SDimitry Andric // Folding immediates with more than one use will increase program size.
848f1a29dd3SDimitry Andric // FIXME: This will also reduce register usage, which may be better
849f1a29dd3SDimitry Andric // in some cases. A better heuristic is needed.
850f1a29dd3SDimitry Andric if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
851f1a29dd3SDimitry Andric foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
852f1a29dd3SDimitry Andric } else {
853f1a29dd3SDimitry Andric if (++NumLiteralUses == 1) {
854f1a29dd3SDimitry Andric NonInlineUse = &*Use;
855f1a29dd3SDimitry Andric NonInlineUseOpNo = OpNo;
856f1a29dd3SDimitry Andric }
857f1a29dd3SDimitry Andric }
858f1a29dd3SDimitry Andric }
859f1a29dd3SDimitry Andric
860f1a29dd3SDimitry Andric if (NumLiteralUses == 1) {
861f1a29dd3SDimitry Andric MachineInstr *UseMI = NonInlineUse->getParent();
862f1a29dd3SDimitry Andric foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
863f1a29dd3SDimitry Andric }
864f1a29dd3SDimitry Andric } else {
865f1a29dd3SDimitry Andric // Folding register.
866b5893f02SDimitry Andric SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
867f1a29dd3SDimitry Andric for (MachineRegisterInfo::use_iterator
868f1a29dd3SDimitry Andric Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
869f1a29dd3SDimitry Andric Use != E; ++Use) {
870b5893f02SDimitry Andric UsesToProcess.push_back(Use);
871b5893f02SDimitry Andric }
872b5893f02SDimitry Andric for (auto U : UsesToProcess) {
873b5893f02SDimitry Andric MachineInstr *UseMI = U->getParent();
874f1a29dd3SDimitry Andric
875b5893f02SDimitry Andric foldOperand(OpToFold, UseMI, U.getOperandNo(),
876f1a29dd3SDimitry Andric FoldList, CopiesToReplace);
877f1a29dd3SDimitry Andric }
878f1a29dd3SDimitry Andric }
879f1a29dd3SDimitry Andric
880f1a29dd3SDimitry Andric MachineFunction *MF = MI.getParent()->getParent();
881f1a29dd3SDimitry Andric // Make sure we add EXEC uses to any new v_mov instructions created.
882f1a29dd3SDimitry Andric for (MachineInstr *Copy : CopiesToReplace)
883f1a29dd3SDimitry Andric Copy->addImplicitDefUseOperands(*MF);
884f1a29dd3SDimitry Andric
885f1a29dd3SDimitry Andric for (FoldCandidate &Fold : FoldList) {
886b5893f02SDimitry Andric if (updateOperand(Fold, *TII, *TRI)) {
887f1a29dd3SDimitry Andric // Clear kill flags.
888f1a29dd3SDimitry Andric if (Fold.isReg()) {
889f1a29dd3SDimitry Andric assert(Fold.OpToFold && Fold.OpToFold->isReg());
890f1a29dd3SDimitry Andric // FIXME: Probably shouldn't bother trying to fold if not an
891f1a29dd3SDimitry Andric // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
892f1a29dd3SDimitry Andric // copies.
893f1a29dd3SDimitry Andric MRI->clearKillFlags(Fold.OpToFold->getReg());
894f1a29dd3SDimitry Andric }
8954ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
8964ba319b5SDimitry Andric << static_cast<int>(Fold.UseOpNo) << " of "
8974ba319b5SDimitry Andric << *Fold.UseMI << '\n');
8987a7e6055SDimitry Andric tryFoldInst(TII, Fold.UseMI);
8996d97bb29SDimitry Andric } else if (Fold.isCommuted()) {
9006d97bb29SDimitry Andric // Restoring instruction's original operand order if fold has failed.
9016d97bb29SDimitry Andric TII->commuteInstruction(*Fold.UseMI, false);
902f1a29dd3SDimitry Andric }
903f1a29dd3SDimitry Andric }
904f1a29dd3SDimitry Andric }
905f1a29dd3SDimitry Andric
9062cab237bSDimitry Andric // Clamp patterns are canonically selected to v_max_* instructions, so only
9072cab237bSDimitry Andric // handle them.
isClamp(const MachineInstr & MI) const9087a7e6055SDimitry Andric const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
9097a7e6055SDimitry Andric unsigned Op = MI.getOpcode();
9107a7e6055SDimitry Andric switch (Op) {
9117a7e6055SDimitry Andric case AMDGPU::V_MAX_F32_e64:
9127a7e6055SDimitry Andric case AMDGPU::V_MAX_F16_e64:
9132cab237bSDimitry Andric case AMDGPU::V_MAX_F64:
9142cab237bSDimitry Andric case AMDGPU::V_PK_MAX_F16: {
9157a7e6055SDimitry Andric if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
9167a7e6055SDimitry Andric return nullptr;
9177a7e6055SDimitry Andric
9187a7e6055SDimitry Andric // Make sure sources are identical.
9197a7e6055SDimitry Andric const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
9207a7e6055SDimitry Andric const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
921db17bf38SDimitry Andric if (!Src0->isReg() || !Src1->isReg() ||
9222cab237bSDimitry Andric Src0->getReg() != Src1->getReg() ||
923db17bf38SDimitry Andric Src0->getSubReg() != Src1->getSubReg() ||
9247a7e6055SDimitry Andric Src0->getSubReg() != AMDGPU::NoSubRegister)
9257a7e6055SDimitry Andric return nullptr;
9267a7e6055SDimitry Andric
9277a7e6055SDimitry Andric // Can't fold up if we have modifiers.
9282cab237bSDimitry Andric if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
9292cab237bSDimitry Andric return nullptr;
9302cab237bSDimitry Andric
9312cab237bSDimitry Andric unsigned Src0Mods
9322cab237bSDimitry Andric = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
9332cab237bSDimitry Andric unsigned Src1Mods
9342cab237bSDimitry Andric = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
9352cab237bSDimitry Andric
9362cab237bSDimitry Andric // Having a 0 op_sel_hi would require swizzling the output in the source
9372cab237bSDimitry Andric // instruction, which we can't do.
9382cab237bSDimitry Andric unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
9392cab237bSDimitry Andric if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
9407a7e6055SDimitry Andric return nullptr;
9417a7e6055SDimitry Andric return Src0;
9427a7e6055SDimitry Andric }
9437a7e6055SDimitry Andric default:
9447a7e6055SDimitry Andric return nullptr;
9457a7e6055SDimitry Andric }
9467a7e6055SDimitry Andric }
9477a7e6055SDimitry Andric
9487a7e6055SDimitry Andric // We obviously have multiple uses in a clamp since the register is used twice
9497a7e6055SDimitry Andric // in the same instruction.
hasOneNonDBGUseInst(const MachineRegisterInfo & MRI,unsigned Reg)9507a7e6055SDimitry Andric static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
9517a7e6055SDimitry Andric int Count = 0;
9527a7e6055SDimitry Andric for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
9537a7e6055SDimitry Andric I != E; ++I) {
9547a7e6055SDimitry Andric if (++Count > 1)
9557a7e6055SDimitry Andric return false;
9567a7e6055SDimitry Andric }
9577a7e6055SDimitry Andric
9587a7e6055SDimitry Andric return true;
9597a7e6055SDimitry Andric }
9607a7e6055SDimitry Andric
9612cab237bSDimitry Andric // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
tryFoldClamp(MachineInstr & MI)9627a7e6055SDimitry Andric bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
9637a7e6055SDimitry Andric const MachineOperand *ClampSrc = isClamp(MI);
9647a7e6055SDimitry Andric if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
9657a7e6055SDimitry Andric return false;
9667a7e6055SDimitry Andric
9677a7e6055SDimitry Andric MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
9682cab237bSDimitry Andric
9692cab237bSDimitry Andric // The type of clamp must be compatible.
9702cab237bSDimitry Andric if (TII->getClampMask(*Def) != TII->getClampMask(MI))
9717a7e6055SDimitry Andric return false;
9722cab237bSDimitry Andric
9737a7e6055SDimitry Andric MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
9747a7e6055SDimitry Andric if (!DefClamp)
9757a7e6055SDimitry Andric return false;
9767a7e6055SDimitry Andric
9774ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
9784ba319b5SDimitry Andric << '\n');
9797a7e6055SDimitry Andric
9807a7e6055SDimitry Andric // Clamp is applied after omod, so it is OK if omod is set.
9817a7e6055SDimitry Andric DefClamp->setImm(1);
9827a7e6055SDimitry Andric MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
9837a7e6055SDimitry Andric MI.eraseFromParent();
9847a7e6055SDimitry Andric return true;
9857a7e6055SDimitry Andric }
9867a7e6055SDimitry Andric
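// Map the literal multiplier of a V_MUL_F32/F16 to the equivalent output
// modifier encoding: 0.5 -> DIV2, 2.0 -> MUL2, 4.0 -> MUL4, anything else
// -> NONE. For example, getOModValue(AMDGPU::V_MUL_F32_e64, 0x40000000)
// yields SIOutMods::MUL2.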
9877a7e6055SDimitry Andric static int getOModValue(unsigned Opc, int64_t Val) {
9887a7e6055SDimitry Andric switch (Opc) {
9897a7e6055SDimitry Andric case AMDGPU::V_MUL_F32_e64: {
9907a7e6055SDimitry Andric switch (static_cast<uint32_t>(Val)) {
9917a7e6055SDimitry Andric case 0x3f000000: // 0.5
9927a7e6055SDimitry Andric return SIOutMods::DIV2;
9937a7e6055SDimitry Andric case 0x40000000: // 2.0
9947a7e6055SDimitry Andric return SIOutMods::MUL2;
9957a7e6055SDimitry Andric case 0x40800000: // 4.0
9967a7e6055SDimitry Andric return SIOutMods::MUL4;
9977a7e6055SDimitry Andric default:
9987a7e6055SDimitry Andric return SIOutMods::NONE;
9997a7e6055SDimitry Andric }
10007a7e6055SDimitry Andric }
10017a7e6055SDimitry Andric case AMDGPU::V_MUL_F16_e64: {
10027a7e6055SDimitry Andric switch (static_cast<uint16_t>(Val)) {
10037a7e6055SDimitry Andric case 0x3800: // 0.5
10047a7e6055SDimitry Andric return SIOutMods::DIV2;
10057a7e6055SDimitry Andric case 0x4000: // 2.0
10067a7e6055SDimitry Andric return SIOutMods::MUL2;
10077a7e6055SDimitry Andric case 0x4400: // 4.0
10087a7e6055SDimitry Andric return SIOutMods::MUL4;
10097a7e6055SDimitry Andric default:
10107a7e6055SDimitry Andric return SIOutMods::NONE;
10117a7e6055SDimitry Andric }
10127a7e6055SDimitry Andric }
10137a7e6055SDimitry Andric default:
10147a7e6055SDimitry Andric llvm_unreachable("invalid mul opcode");
10157a7e6055SDimitry Andric }
10167a7e6055SDimitry Andric }
10177a7e6055SDimitry Andric
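// Check whether MI is an omod-expressible scaling of a single register:
// either a multiply by 0.5, 2.0 or 4.0, or an add of a register to itself
// (x + x, i.e. 2.0 * x). Returns the scaled register operand and the
// SIOutMods encoding, or (nullptr, SIOutMods::NONE) if no fold is possible.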
10187a7e6055SDimitry Andric // FIXME: Does this really not support denormals with f16?
10197a7e6055SDimitry Andric // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
10207a7e6055SDimitry Andric // handled, so will anything other than that break?
10217a7e6055SDimitry Andric std::pair<const MachineOperand *, int>
10227a7e6055SDimitry Andric SIFoldOperands::isOMod(const MachineInstr &MI) const {
10237a7e6055SDimitry Andric unsigned Op = MI.getOpcode();
10247a7e6055SDimitry Andric switch (Op) {
10257a7e6055SDimitry Andric case AMDGPU::V_MUL_F32_e64:
10267a7e6055SDimitry Andric case AMDGPU::V_MUL_F16_e64: {
10277a7e6055SDimitry Andric // If output denormals are enabled, omod is ignored.
10287a7e6055SDimitry Andric if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
10297a7e6055SDimitry Andric (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
10307a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10317a7e6055SDimitry Andric
10327a7e6055SDimitry Andric const MachineOperand *RegOp = nullptr;
10337a7e6055SDimitry Andric const MachineOperand *ImmOp = nullptr;
10347a7e6055SDimitry Andric const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
10357a7e6055SDimitry Andric const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
10367a7e6055SDimitry Andric if (Src0->isImm()) {
10377a7e6055SDimitry Andric ImmOp = Src0;
10387a7e6055SDimitry Andric RegOp = Src1;
10397a7e6055SDimitry Andric } else if (Src1->isImm()) {
10407a7e6055SDimitry Andric ImmOp = Src1;
10417a7e6055SDimitry Andric RegOp = Src0;
10427a7e6055SDimitry Andric } else
10437a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10447a7e6055SDimitry Andric
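    // Bail out if the multiply itself uses source modifiers, clamp, or omod;
    // the fold only sets omod on the defining instruction and would drop them.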
10457a7e6055SDimitry Andric int OMod = getOModValue(Op, ImmOp->getImm());
10467a7e6055SDimitry Andric if (OMod == SIOutMods::NONE ||
10477a7e6055SDimitry Andric TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
10487a7e6055SDimitry Andric TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
10497a7e6055SDimitry Andric TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
10507a7e6055SDimitry Andric TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
10517a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10527a7e6055SDimitry Andric
10537a7e6055SDimitry Andric return std::make_pair(RegOp, OMod);
10547a7e6055SDimitry Andric }
10557a7e6055SDimitry Andric case AMDGPU::V_ADD_F32_e64:
10567a7e6055SDimitry Andric case AMDGPU::V_ADD_F16_e64: {
10577a7e6055SDimitry Andric // If output denormals are enabled, omod is ignored.
10587a7e6055SDimitry Andric if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
10597a7e6055SDimitry Andric (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
10607a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10617a7e6055SDimitry Andric
10627a7e6055SDimitry Andric // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
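    // Such an add of a register to itself is treated as a multiply by 2.0,
    // which is expressible as omod MUL2 on the register's defining instruction.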
10637a7e6055SDimitry Andric const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
10647a7e6055SDimitry Andric const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
10657a7e6055SDimitry Andric
10667a7e6055SDimitry Andric if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
10677a7e6055SDimitry Andric Src0->getSubReg() == Src1->getSubReg() &&
10687a7e6055SDimitry Andric !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
10697a7e6055SDimitry Andric !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
10707a7e6055SDimitry Andric !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
10717a7e6055SDimitry Andric !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
10727a7e6055SDimitry Andric return std::make_pair(Src0, SIOutMods::MUL2);
10737a7e6055SDimitry Andric
10747a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10757a7e6055SDimitry Andric }
10767a7e6055SDimitry Andric default:
10777a7e6055SDimitry Andric return std::make_pair(nullptr, SIOutMods::NONE);
10787a7e6055SDimitry Andric }
10797a7e6055SDimitry Andric }
10807a7e6055SDimitry Andric
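// Fold an omod-expressible multiply or add into the omod operand of the
// instruction defining its source. Roughly (illustrative MIR, constants shown
// symbolically and operand lists abbreviated):
//
//   %1 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2 = V_MUL_F32_e64 0, 2.0, 0, %1, 0, 0
//
// becomes the V_ADD_F32_e64 with its omod operand set to SIOutMods::MUL2,
// with uses of %2 rewritten to %1.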
10817a7e6055SDimitry Andric // FIXME: Does this need to check IEEE bit on function?
10827a7e6055SDimitry Andric bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
10837a7e6055SDimitry Andric const MachineOperand *RegOp;
10847a7e6055SDimitry Andric int OMod;
10857a7e6055SDimitry Andric std::tie(RegOp, OMod) = isOMod(MI);
10867a7e6055SDimitry Andric if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
10877a7e6055SDimitry Andric RegOp->getSubReg() != AMDGPU::NoSubRegister ||
10887a7e6055SDimitry Andric !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
10897a7e6055SDimitry Andric return false;
10907a7e6055SDimitry Andric
10917a7e6055SDimitry Andric MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
10927a7e6055SDimitry Andric MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
10937a7e6055SDimitry Andric if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
10947a7e6055SDimitry Andric return false;
10957a7e6055SDimitry Andric
10967a7e6055SDimitry Andric // Clamp is applied after omod. If the source already has clamp set, don't
10977a7e6055SDimitry Andric // fold it.
10987a7e6055SDimitry Andric if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
10997a7e6055SDimitry Andric return false;
11007a7e6055SDimitry Andric
11014ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
11027a7e6055SDimitry Andric
11037a7e6055SDimitry Andric DefOMod->setImm(OMod);
11047a7e6055SDimitry Andric MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
11057a7e6055SDimitry Andric MI.eraseFromParent();
11067a7e6055SDimitry Andric return true;
11077a7e6055SDimitry Andric }
11087a7e6055SDimitry Andric
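// Pass entry point: walk the function in depth-first block order. Each
// instruction is first given to tryFoldInst; non-copy instructions are then
// considered for the omod and clamp folds above, while foldable copies of
// immediates, frame indexes, or virtual registers have their source operand
// propagated into their users via foldInstOperand.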
11098f0fd8f6SDimitry Andric bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
11102cab237bSDimitry Andric if (skipFunction(MF.getFunction()))
11113ca95b02SDimitry Andric return false;
11123ca95b02SDimitry Andric
1113f1a29dd3SDimitry Andric MRI = &MF.getRegInfo();
11144ba319b5SDimitry Andric ST = &MF.getSubtarget<GCNSubtarget>();
11157a7e6055SDimitry Andric TII = ST->getInstrInfo();
1116f1a29dd3SDimitry Andric TRI = &TII->getRegisterInfo();
11178f0fd8f6SDimitry Andric
11187a7e6055SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11197a7e6055SDimitry Andric
11207a7e6055SDimitry Andric // omod is ignored by hardware if IEEE bit is enabled. omod also does not
11217a7e6055SDimitry Andric // correctly handle signed zeros.
11227a7e6055SDimitry Andric //
1123b5893f02SDimitry Andric bool IsIEEEMode = ST->enableIEEEBit(MF);
1124b5893f02SDimitry Andric bool HasNSZ = MFI->hasNoSignedZerosFPMath();
11257a7e6055SDimitry Andric
1126edd7eaddSDimitry Andric for (MachineBasicBlock *MBB : depth_first(&MF)) {
11278f0fd8f6SDimitry Andric MachineBasicBlock::iterator I, Next;
1128edd7eaddSDimitry Andric for (I = MBB->begin(); I != MBB->end(); I = Next) {
11298f0fd8f6SDimitry Andric Next = std::next(I);
11308f0fd8f6SDimitry Andric MachineInstr &MI = *I;
11318f0fd8f6SDimitry Andric
11327a7e6055SDimitry Andric tryFoldInst(TII, &MI);
11337a7e6055SDimitry Andric
11347a7e6055SDimitry Andric if (!TII->isFoldableCopy(MI)) {
1135b5893f02SDimitry Andric // TODO: Omod might be OK if there is NSZ only on the source
1136b5893f02SDimitry Andric // instruction, and not the omod multiply.
1137b5893f02SDimitry Andric if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1138b5893f02SDimitry Andric !tryFoldOMod(MI))
11397a7e6055SDimitry Andric tryFoldClamp(MI);
11408f0fd8f6SDimitry Andric continue;
11417a7e6055SDimitry Andric }
11428f0fd8f6SDimitry Andric
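      // For a foldable copy (COPY or a MOV), operand 0 is the destination and
      // operand 1 is the value to try to propagate into its users.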
11438f0fd8f6SDimitry Andric MachineOperand &OpToFold = MI.getOperand(1);
1144d88c1a5aSDimitry Andric bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
11458f0fd8f6SDimitry Andric
1146f1a29dd3SDimitry Andric // FIXME: We could also be folding things like TargetIndexes.
11478f0fd8f6SDimitry Andric if (!FoldingImm && !OpToFold.isReg())
11488f0fd8f6SDimitry Andric continue;
11498f0fd8f6SDimitry Andric
11508f0fd8f6SDimitry Andric if (OpToFold.isReg() &&
1151444ed5c5SDimitry Andric !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
11528f0fd8f6SDimitry Andric continue;
11538f0fd8f6SDimitry Andric
1154444ed5c5SDimitry Andric // Prevent folding operands backwards through the function. For example, the
1155444ed5c5SDimitry Andric // use of %vgpr0 in the COPY below must not be replaced with the immediate 1:
1156444ed5c5SDimitry Andric //
11572cab237bSDimitry Andric // %3 = COPY %vgpr0; VGPR_32:%3
1158444ed5c5SDimitry Andric // ...
11592cab237bSDimitry Andric // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1160444ed5c5SDimitry Andric MachineOperand &Dst = MI.getOperand(0);
1161444ed5c5SDimitry Andric if (Dst.isReg() &&
1162444ed5c5SDimitry Andric !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
1163444ed5c5SDimitry Andric continue;
11647d523365SDimitry Andric
1165f1a29dd3SDimitry Andric foldInstOperand(MI, OpToFold);
11668f0fd8f6SDimitry Andric }
11678f0fd8f6SDimitry Andric }
11688f0fd8f6SDimitry Andric return false;
11698f0fd8f6SDimitry Andric }
1170