1 //===-- SIPreEmitPeephole.cpp ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This pass performs the peephole optimizations before code emission. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPUSubtarget.h" 16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 17 #include "SIInstrInfo.h" 18 #include "SIMachineFunctionInfo.h" 19 #include "llvm/CodeGen/MachineFunctionPass.h" 20 #include "llvm/Support/CommandLine.h" 21 22 using namespace llvm; 23 24 #define DEBUG_TYPE "si-pre-emit-peephole" 25 26 namespace { 27 28 class SIPreEmitPeephole : public MachineFunctionPass { 29 private: 30 const SIInstrInfo *TII = nullptr; 31 const SIRegisterInfo *TRI = nullptr; 32 33 bool optimizeVccBranch(MachineInstr &MI) const; 34 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; 35 36 public: 37 static char ID; 38 39 SIPreEmitPeephole() : MachineFunctionPass(ID) { 40 initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); 41 } 42 43 bool runOnMachineFunction(MachineFunction &MF) override; 44 }; 45 46 } // End anonymous namespace. 47 48 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE, 49 "SI peephole optimizations", false, false) 50 51 char SIPreEmitPeephole::ID = 0; 52 53 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID; 54 55 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { 56 // Match: 57 // sreg = -1 58 // vcc = S_AND_B64 exec, sreg 59 // S_CBRANCH_VCC[N]Z 60 // => 61 // S_CBRANCH_EXEC[N]Z 62 // We end up with this pattern sometimes after basic block placement. 63 // It happens while combining a block which assigns -1 to a saved mask and 64 // another block which consumes that saved mask and then a branch. 65 bool Changed = false; 66 MachineBasicBlock &MBB = *MI.getParent(); 67 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); 68 const bool IsWave32 = ST.isWave32(); 69 const unsigned CondReg = TRI->getVCC(); 70 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 71 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 72 73 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), 74 E = MBB.rend(); 75 bool ReadsCond = false; 76 unsigned Threshold = 5; 77 for (++A; A != E; ++A) { 78 if (!--Threshold) 79 return false; 80 if (A->modifiesRegister(ExecReg, TRI)) 81 return false; 82 if (A->modifiesRegister(CondReg, TRI)) { 83 if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) 84 return false; 85 break; 86 } 87 ReadsCond |= A->readsRegister(CondReg, TRI); 88 } 89 if (A == E) 90 return false; 91 92 MachineOperand &Op1 = A->getOperand(1); 93 MachineOperand &Op2 = A->getOperand(2); 94 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { 95 TII->commuteInstruction(*A); 96 Changed = true; 97 } 98 if (Op1.getReg() != ExecReg) 99 return Changed; 100 if (Op2.isImm() && Op2.getImm() != -1) 101 return Changed; 102 103 Register SReg; 104 if (Op2.isReg()) { 105 SReg = Op2.getReg(); 106 auto M = std::next(A); 107 bool ReadsSreg = false; 108 for (; M != E; ++M) { 109 if (M->definesRegister(SReg, TRI)) 110 break; 111 if (M->modifiesRegister(SReg, TRI)) 112 return Changed; 113 ReadsSreg |= M->readsRegister(SReg, TRI); 114 } 115 if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() || 116 M->getOperand(1).getImm() != -1) 117 return Changed; 118 // First if sreg is only used in and instruction fold the immediate 119 // into that and. 120 if (!ReadsSreg && Op2.isKill()) { 121 A->getOperand(2).ChangeToImmediate(-1); 122 M->eraseFromParent(); 123 } 124 } 125 126 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && 127 MI.killsRegister(CondReg, TRI)) 128 A->eraseFromParent(); 129 130 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; 131 if (SReg == ExecReg) { 132 if (IsVCCZ) { 133 MI.eraseFromParent(); 134 return true; 135 } 136 MI.setDesc(TII->get(AMDGPU::S_BRANCH)); 137 } else { 138 MI.setDesc( 139 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ)); 140 } 141 142 MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); 143 MI.addImplicitDefUseOperands(*MBB.getParent()); 144 145 return true; 146 } 147 148 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, 149 MachineInstr &MI) const { 150 MachineBasicBlock &MBB = *MI.getParent(); 151 const MachineFunction &MF = *MBB.getParent(); 152 const MachineRegisterInfo &MRI = MF.getRegInfo(); 153 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 154 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register(); 155 SmallVector<MachineInstr *, 4> ToRemove; 156 bool IdxOn = true; 157 158 if (!MI.isIdenticalTo(First)) 159 return false; 160 161 // Scan back to find an identical S_SET_GPR_IDX_ON 162 for (MachineBasicBlock::iterator I = std::next(First.getIterator()), 163 E = MI.getIterator(); I != E; ++I) { 164 switch (I->getOpcode()) { 165 case AMDGPU::S_SET_GPR_IDX_MODE: 166 return false; 167 case AMDGPU::S_SET_GPR_IDX_OFF: 168 IdxOn = false; 169 ToRemove.push_back(&*I); 170 break; 171 default: 172 if (I->modifiesRegister(AMDGPU::M0, TRI)) 173 return false; 174 if (IdxReg && I->modifiesRegister(IdxReg, TRI)) 175 return false; 176 if (llvm::any_of(I->operands(), 177 [&MRI, this](const MachineOperand &MO) { 178 return MO.isReg() && 179 TRI->isVectorRegister(MRI, MO.getReg()); 180 })) { 181 // The only exception allowed here is another indirect vector move 182 // with the same mode. 183 if (!IdxOn || 184 !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 && 185 I->hasRegisterImplicitUseOperand(AMDGPU::M0)) || 186 I->getOpcode() == AMDGPU::V_MOV_B32_indirect)) 187 return false; 188 } 189 } 190 } 191 192 MI.eraseFromParent(); 193 for (MachineInstr *RI : ToRemove) 194 RI->eraseFromParent(); 195 return true; 196 } 197 198 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { 199 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 200 TII = ST.getInstrInfo(); 201 TRI = &TII->getRegisterInfo(); 202 MachineBasicBlock *EmptyMBBAtEnd = nullptr; 203 bool Changed = false; 204 205 for (MachineBasicBlock &MBB : MF) { 206 MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); 207 if (MBBE != MBB.end()) { 208 MachineInstr &MI = *MBBE; 209 switch (MI.getOpcode()) { 210 case AMDGPU::S_CBRANCH_VCCZ: 211 case AMDGPU::S_CBRANCH_VCCNZ: 212 Changed |= optimizeVccBranch(MI); 213 continue; 214 case AMDGPU::SI_RETURN_TO_EPILOG: 215 // FIXME: This is not an optimization and should be 216 // moved somewhere else. 217 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); 218 219 // Graphics shaders returning non-void shouldn't contain S_ENDPGM, 220 // because external bytecode will be appended at the end. 221 if (&MBB != &MF.back() || &MI != &MBB.back()) { 222 // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block 223 // at the end and jump there. 224 if (!EmptyMBBAtEnd) { 225 EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); 226 MF.insert(MF.end(), EmptyMBBAtEnd); 227 } 228 229 MBB.addSuccessor(EmptyMBBAtEnd); 230 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) 231 .addMBB(EmptyMBBAtEnd); 232 MI.eraseFromParent(); 233 MBBE = MBB.getFirstTerminator(); 234 } 235 break; 236 default: 237 break; 238 } 239 } 240 241 if (!ST.hasVGPRIndexMode()) 242 continue; 243 244 MachineInstr *SetGPRMI = nullptr; 245 const unsigned Threshold = 20; 246 unsigned Count = 0; 247 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a 248 // second is not needed. Do expensive checks in the optimizeSetGPR() 249 // and limit the distance to 20 instructions for compile time purposes. 250 for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) { 251 MachineInstr &MI = *MBBI; 252 ++MBBI; 253 254 if (Count == Threshold) 255 SetGPRMI = nullptr; 256 else 257 ++Count; 258 259 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON) 260 continue; 261 262 Count = 0; 263 if (!SetGPRMI) { 264 SetGPRMI = &MI; 265 continue; 266 } 267 268 if (optimizeSetGPR(*SetGPRMI, MI)) 269 Changed = true; 270 else 271 SetGPRMI = &MI; 272 } 273 } 274 275 return Changed; 276 } 277