//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the AMDGPU DAG scheduling mutation to pair
/// VOPD instructions back to back. It also contains subroutines useful
/// in the creation of VOPD instructions.
//
//===----------------------------------------------------------------------===//

#include "GCNVOPDUtils.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInst.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-vopd-utils"

bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                   const MachineInstr &FirstMI,
                                   const MachineInstr &SecondMI) {
  const MachineFunction *MF = FirstMI.getMF();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  const unsigned NumVGPRBanks = 4;
  // Literals also count against the scalar bus limit.
  SmallVector<const MachineOperand *> UniqueLiterals;
  auto addLiteral = [&](const MachineOperand &Op) {
    for (auto &Literal : UniqueLiterals) {
      if (Literal->isIdenticalTo(Op))
        return;
    }
    UniqueLiterals.push_back(&Op);
  };
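  // SGPR sources and literals share the scalar bus; collect unique scalar
  // registers so the combined limit can be checked below.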
  SmallVector<Register> UniqueScalarRegs;
  assert([&]() -> bool {
    for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
         MII != FirstMI.getParent()->instr_end(); ++MII) {
      if (&*MII == &SecondMI)
        return true;
    }
    return false;
  }() && "Expected FirstMI to precede SecondMI");
  // Cannot pair dependent instructions.
  for (const auto &Use : SecondMI.uses())
    if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
      return false;

  struct ComponentInfo {
    ComponentInfo(const MachineInstr &MI) : MI(MI) {}
    Register Dst, Reg0, Reg1, Reg2;
    const MachineInstr &MI;
  };
  ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};

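  // For each component, record which VGPR feeds each VOPD source port:
  // Reg0 <- src0, Reg1 <- src1, and Reg2 <- src2 (for FMA-like opcodes,
  // where the tied destination also acts as src2).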
  for (ComponentInfo &Comp : CInfo) {
    switch (Comp.MI.getOpcode()) {
    case AMDGPU::V_FMAMK_F32:
      // Cannot inline the fixed literal in fmamk.
      addLiteral(Comp.MI.getOperand(2));
      Comp.Reg2 = Comp.MI.getOperand(3).getReg();
      break;
    case AMDGPU::V_FMAAK_F32:
      // Cannot inline the fixed literal in fmaak.
      addLiteral(Comp.MI.getOperand(3));
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    case AMDGPU::V_FMAC_F32_e32:
    case AMDGPU::V_DOT2_F32_F16:
    case AMDGPU::V_DOT2_F32_BF16:
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      Comp.Reg2 = Comp.MI.getOperand(0).getReg();
      break;
    case AMDGPU::V_CNDMASK_B32_e32:
      UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    case AMDGPU::V_MOV_B32_e32:
      break;
    default:
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    }

    Comp.Dst = Comp.MI.getOperand(0).getReg();

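    // src0 (operand 1) may be a VGPR, which feeds port 0; a scalar
    // register, which occupies the scalar bus; or an immediate, which
    // counts as a literal unless it can be encoded inline.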
    const MachineOperand &Op0 = Comp.MI.getOperand(1);
    if (Op0.isReg()) {
      if (!TRI->isVectorRegister(MRI, Op0.getReg())) {
        if (!is_contained(UniqueScalarRegs, Op0.getReg()))
          UniqueScalarRegs.push_back(Op0.getReg());
      } else
        Comp.Reg0 = Op0.getReg();
    } else {
      if (!TII.isInlineConstant(Comp.MI, 1))
        addLiteral(Op0);
    }
  }

  if (UniqueLiterals.size() > 1)
    return false;
  if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
    return false;

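  // The X and Y components read their sources through shared ports: the two
  // src0 operands must come from different VGPR banks (VGPR number modulo
  // NumVGPRBanks), likewise the two src1 operands, while the two src2
  // operands and the two destinations must have opposite even/odd parity.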
  // Check port 0.
  if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
      CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
    return false;
  // Check port 1.
  if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
      CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
    return false;
  // Check port 2.
  if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
      !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
    return false;
  if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
    return false;

  LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
                    << "\n\tY: " << SecondMI << "\n");
  return true;
}

/// Check if the instruction pair, FirstMI and SecondMI, should be scheduled
/// together. When FirstMI is unspecified, check whether SecondMI can be part
/// of a fused pair at all.
static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
                                       const TargetSubtargetInfo &TSI,
                                       const MachineInstr *FirstMI,
                                       const MachineInstr &SecondMI) {
  const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
  unsigned Opc2 = SecondMI.getOpcode();
  auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);

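  // A VOPD instruction has two component slots, X and Y; getCanBeVOPD
  // reports which slot(s) a given opcode may occupy, and a pair is viable
  // only if the two opcodes can fill opposite slots.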
  // One-instruction case.
  if (!FirstMI)
    return SecondCanBeVOPD.Y;

  unsigned Opc = FirstMI->getOpcode();
  auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);

  if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
        (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
    return false;

  return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
}

/// Adapts the design of MacroFusion: puts valid candidate instructions
/// back to back so they can easily be turned into VOPD instructions.
/// Greedily pairs instruction candidates; O(n^2) algorithm.
struct VOPDPairingMutation : ScheduleDAGMutation {
  ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer

  VOPDPairingMutation(
      ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
      : shouldScheduleAdjacent(shouldScheduleAdjacent) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    const TargetInstrInfo &TII = *DAG->TII;
    const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
    if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
      LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
      return;
    }

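    // Greedy quadratic scan: for each SUnit that could appear in a VOPD
    // pair, find the first later SUnit it can legally pair with and fuse
    // the two with a cluster edge.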
    std::vector<SUnit>::iterator ISUI, JSUI;
    for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
      const MachineInstr *IMI = ISUI->getInstr();
      if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
        continue;
      if (!hasLessThanNumFused(*ISUI, 2))
        continue;

      for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
        if (JSUI->isBoundaryNode())
          continue;
        const MachineInstr *JMI = JSUI->getInstr();
        if (!hasLessThanNumFused(*JSUI, 2) ||
            !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
          continue;
        if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
          break;
      }
    }
    LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
  }
};

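// Typical use (a sketch; the actual hook lives in the AMDGPU scheduler
// setup): attach the mutation when building the scheduling DAG, e.g.
//   DAG->addMutation(createVOPDPairingMutation());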
std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
  return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
}